From bcac5d15f6e9b7b2cbb61d9725ea29007d1c6426 Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 17:20:49 -1000 Subject: [PATCH 01/61] treedb: expose vlog generation maintenance bytes in stats --- TreeDB/caching/db.go | 216 +++++++++++------- TreeDB/caching/expvar_stats.go | 1 + TreeDB/caching/expvar_stats_test.go | 4 + .../caching/vlog_generation_scheduler_test.go | 47 ++++ 4 files changed, 184 insertions(+), 84 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index b38354bec..6a1255668 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5142,90 +5142,94 @@ type DB struct { valueLogMaxSegmentBytes int64 journalCompression bool - disableJournal bool - relaxedSync bool - notifyError func(error) - debugFlushPointers bool - debugFlushTiming bool - debugPtrEligible atomic.Int64 - debugPtrUsed atomic.Int64 - debugPtrNoPtr atomic.Int64 - debugPtrDenied atomic.Int64 - debugPtrDisabled atomic.Int64 - checkpointRuns atomic.Uint64 - checkpointTotalNs atomic.Uint64 - checkpointMaxNs atomic.Uint64 - checkpointNoopSkips atomic.Uint64 - checkpointFlushMuWaitNs atomic.Uint64 - checkpointFlushMuWaitMaxNs atomic.Uint64 - checkpointAutoVacuumRuns atomic.Uint64 - checkpointAutoVacuumLastCheckRun atomic.Uint64 - checkpointAutoVacuumLastPages atomic.Uint64 - checkpointAutoVacuumLastInternalP50 atomic.Uint64 - checkpointAutoVacuumLastInternalAvg atomic.Uint64 - lastForegroundWriteUnixNano atomic.Int64 - lastForegroundReadUnixNano atomic.Int64 - foregroundReadStampCounter atomic.Uint32 - activeForegroundIterators atomic.Int64 - retainedPruneLastStartUnixNano atomic.Int64 - retainedPruneMu sync.Mutex - retainedPruneDone chan struct{} - vlogGenerationRemapSuccesses atomic.Uint64 - vlogGenerationRemapFailures atomic.Uint64 - vlogGenerationRewriteBytesIn atomic.Uint64 - vlogGenerationRewriteBytesOut atomic.Uint64 - vlogGenerationRewriteRuns atomic.Uint64 - vlogGenerationRewritePlanRuns atomic.Uint64 - vlogGenerationRewritePlanCanceled atomic.Uint64 
- vlogGenerationRewritePlanErrors atomic.Uint64 - vlogGenerationRewritePlanEmpty atomic.Uint64 - vlogGenerationRewritePlanSelected atomic.Uint64 - vlogGenerationRewritePlanCanceledLastNS atomic.Int64 - vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 - vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool - vlogGenerationRewriteIneffectiveLastNS atomic.Int64 - vlogGenerationRewriteIneffectiveRuns atomic.Uint64 - vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 - vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 - vlogGenerationRewriteCanceledRuns atomic.Uint64 - vlogGenerationRewriteCanceledLastNS atomic.Int64 - vlogGenerationRewriteQueuePruneRuns atomic.Uint64 - vlogGenerationRewriteQueuePruneIDs atomic.Uint64 - vlogGenerationGCSegmentsDeleted atomic.Uint64 - vlogGenerationGCBytesDeleted atomic.Uint64 - vlogGenerationGCRuns atomic.Uint64 - vlogGenerationVacuumRuns atomic.Uint64 - vlogGenerationVacuumFailures atomic.Uint64 - vlogGenerationLastVacuumUnixNano atomic.Int64 - vlogGenerationLastRewritePlanUnixNano atomic.Int64 - vlogGenerationLastRewriteUnixNano atomic.Int64 - vlogGenerationLastGCUnixNano atomic.Int64 - vlogGenerationLastCheckpointKickUnixNano atomic.Int64 - vlogGenerationLastGCDryRunUnixNano atomic.Int64 - vlogGenerationLastGCDryRunBytesEligible atomic.Int64 - vlogGenerationLastGCDryRunSegsEligible atomic.Int64 - vlogGenerationChurnBytes atomic.Uint64 - vlogGenerationSchedulerState atomic.Uint32 - vlogGenerationMaintenanceActive atomic.Bool - vlogGenerationLastReason atomic.Uint32 - vlogGenerationCheckpointKickRuns atomic.Uint64 - vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 - vlogGenerationCheckpointKickGCRuns atomic.Uint64 - vlogGenerationCheckpointKickPending atomic.Bool - vlogGenerationDeferredMaintenancePending atomic.Bool - vlogGenerationDeferredMaintenanceRunning atomic.Bool - vlogGenerationRewriteStageWakeObservedNS atomic.Int64 - vlogGenerationRewriteQueueMu sync.Mutex - vlogGenerationCheckpointKickActive atomic.Bool - 
vlogGenerationRewriteQueue []uint32 - vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment - vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty - vlogGenerationRewriteStagePending bool - vlogGenerationRewriteStageObservedUnixNano int64 - vlogGenerationRewriteQueueLoaded bool - vlogGenerationLastChurnBps atomic.Int64 - vlogGenerationLastChurnSampleBytes atomic.Uint64 - vlogGenerationLastChurnSampleNS atomic.Int64 + disableJournal bool + relaxedSync bool + notifyError func(error) + debugFlushPointers bool + debugFlushTiming bool + debugPtrEligible atomic.Int64 + debugPtrUsed atomic.Int64 + debugPtrNoPtr atomic.Int64 + debugPtrDenied atomic.Int64 + debugPtrDisabled atomic.Int64 + checkpointRuns atomic.Uint64 + checkpointTotalNs atomic.Uint64 + checkpointMaxNs atomic.Uint64 + checkpointNoopSkips atomic.Uint64 + checkpointFlushMuWaitNs atomic.Uint64 + checkpointFlushMuWaitMaxNs atomic.Uint64 + checkpointAutoVacuumRuns atomic.Uint64 + checkpointAutoVacuumLastCheckRun atomic.Uint64 + checkpointAutoVacuumLastPages atomic.Uint64 + checkpointAutoVacuumLastInternalP50 atomic.Uint64 + checkpointAutoVacuumLastInternalAvg atomic.Uint64 + lastForegroundWriteUnixNano atomic.Int64 + lastForegroundReadUnixNano atomic.Int64 + foregroundReadStampCounter atomic.Uint32 + activeForegroundIterators atomic.Int64 + retainedPruneLastStartUnixNano atomic.Int64 + retainedPruneMu sync.Mutex + retainedPruneDone chan struct{} + vlogGenerationRemapSuccesses atomic.Uint64 + vlogGenerationRemapFailures atomic.Uint64 + vlogGenerationRewriteBytesIn atomic.Uint64 + vlogGenerationRewriteBytesOut atomic.Uint64 + vlogGenerationRewriteReclaimedBytes atomic.Uint64 + vlogGenerationRewriteRuns atomic.Uint64 + vlogGenerationRewritePlanRuns atomic.Uint64 + vlogGenerationRewritePlanCanceled atomic.Uint64 + vlogGenerationRewritePlanErrors atomic.Uint64 + vlogGenerationRewritePlanEmpty atomic.Uint64 + vlogGenerationRewritePlanSelected atomic.Uint64 + 
vlogGenerationRewritePlanSelectedBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 + vlogGenerationRewritePlanCanceledLastNS atomic.Int64 + vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 + vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool + vlogGenerationRewriteIneffectiveLastNS atomic.Int64 + vlogGenerationRewriteIneffectiveRuns atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 + vlogGenerationRewriteCanceledRuns atomic.Uint64 + vlogGenerationRewriteCanceledLastNS atomic.Int64 + vlogGenerationRewriteQueuePruneRuns atomic.Uint64 + vlogGenerationRewriteQueuePruneIDs atomic.Uint64 + vlogGenerationGCSegmentsDeleted atomic.Uint64 + vlogGenerationGCBytesDeleted atomic.Uint64 + vlogGenerationGCRuns atomic.Uint64 + vlogGenerationVacuumRuns atomic.Uint64 + vlogGenerationVacuumFailures atomic.Uint64 + vlogGenerationLastVacuumUnixNano atomic.Int64 + vlogGenerationLastRewritePlanUnixNano atomic.Int64 + vlogGenerationLastRewriteUnixNano atomic.Int64 + vlogGenerationLastGCUnixNano atomic.Int64 + vlogGenerationLastCheckpointKickUnixNano atomic.Int64 + vlogGenerationLastGCDryRunUnixNano atomic.Int64 + vlogGenerationLastGCDryRunBytesEligible atomic.Int64 + vlogGenerationLastGCDryRunSegsEligible atomic.Int64 + vlogGenerationChurnBytes atomic.Uint64 + vlogGenerationSchedulerState atomic.Uint32 + vlogGenerationMaintenanceActive atomic.Bool + vlogGenerationLastReason atomic.Uint32 + vlogGenerationCheckpointKickRuns atomic.Uint64 + vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 + vlogGenerationCheckpointKickGCRuns atomic.Uint64 + vlogGenerationCheckpointKickPending atomic.Bool + vlogGenerationDeferredMaintenancePending atomic.Bool + vlogGenerationDeferredMaintenanceRunning atomic.Bool + vlogGenerationRewriteStageWakeObservedNS atomic.Int64 + vlogGenerationRewriteQueueMu sync.Mutex + 
vlogGenerationCheckpointKickActive atomic.Bool + vlogGenerationRewriteQueue []uint32 + vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment + vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty + vlogGenerationRewriteStagePending bool + vlogGenerationRewriteStageObservedUnixNano int64 + vlogGenerationRewriteQueueLoaded bool + vlogGenerationLastChurnBps atomic.Int64 + vlogGenerationLastChurnSampleBytes atomic.Uint64 + vlogGenerationLastChurnSampleNS atomic.Int64 // Rewrite budget token bucket (bytes) for online maintenance. This lets us // interpret ValueLogRewriteBudgetBytesPerSec as a true per-second bandwidth // budget while still running maintenance at coarse intervals. @@ -12431,6 +12435,43 @@ func (db *DB) observeVlogGenerationRewritePlanOutcome(plan backenddb.ValueLogRew } if len(plan.SourceFileIDs) > 0 || len(plan.SelectedSegments) > 0 || plan.SegmentsSelected > 0 { db.vlogGenerationRewritePlanSelected.Add(1) + selectedTotal := plan.SelectedBytesTotal + selectedLive := plan.SelectedBytesLive + selectedStale := plan.SelectedBytesStale + if len(plan.SelectedSegments) > 0 && (selectedTotal <= 0 || selectedLive <= 0 || selectedStale <= 0) { + fallbackTotal := int64(0) + fallbackLive := int64(0) + fallbackStale := int64(0) + for _, seg := range plan.SelectedSegments { + if seg.BytesTotal > 0 { + fallbackTotal += seg.BytesTotal + } + if seg.BytesLive > 0 { + fallbackLive += seg.BytesLive + } + if seg.BytesStale > 0 { + fallbackStale += seg.BytesStale + } + } + if selectedTotal <= 0 { + selectedTotal = fallbackTotal + } + if selectedLive <= 0 { + selectedLive = fallbackLive + } + if selectedStale <= 0 { + selectedStale = fallbackStale + } + } + if selectedTotal > 0 { + db.vlogGenerationRewritePlanSelectedBytes.Add(uint64(selectedTotal)) + } + if selectedLive > 0 { + db.vlogGenerationRewritePlanSelectedLiveBytes.Add(uint64(selectedLive)) + } + if selectedStale > 0 { + 
db.vlogGenerationRewritePlanSelectedStaleBytes.Add(uint64(selectedStale)) + } return } db.vlogGenerationRewritePlanEmpty.Add(1) @@ -13747,6 +13788,9 @@ planned: } db.debugVlogMaintf("gc_after_rewrite_done reason=%s dur_ms=%.3f", vlogGenerationReasonString(reason), float64(time.Since(gcStart).Microseconds())/1000) } + if effectiveBytesBefore > effectiveBytesAfter { + db.vlogGenerationRewriteReclaimedBytes.Add(uint64(effectiveBytesBefore - effectiveBytesAfter)) + } locallyEffectiveProcessedDebt := len(processedRewriteIDs) > 0 && processedLedgerOK && processedLedgerStaleBytes > 0 && stats.RecordsCopied > 0 if effectiveBytesBefore > 0 && effectiveBytesAfter >= effectiveBytesBefore && !locallyEffectiveProcessedDebt { db.vlogGenerationRewriteIneffectiveRuns.Add(1) @@ -19486,6 +19530,9 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.plan_errors"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanErrors.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_empty"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanEmpty.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelected.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_live"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedLiveBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedStaleBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledLastNS.Load()) stats["treedb.cache.vlog_generation.rewrite.queue_prune_runs"] = fmt.Sprintf("%d", 
db.vlogGenerationRewriteQueuePruneRuns.Load()) @@ -19495,6 +19542,7 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.ineffective_bytes_out"] = fmt.Sprintf("%d", db.vlogGenerationRewriteIneffectiveBytesOut.Load()) stats["treedb.cache.vlog_generation.rewrite.ineffective_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteIneffectiveLastNS.Load()) stats["treedb.cache.vlog_generation.rewrite.ineffective_backoff_seconds"] = fmt.Sprintf("%.0f", vlogGenerationRewriteIneffectiveBackoff.Seconds()) + stats["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteReclaimedBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastRewritePlanUnixNano.Load()) stats["treedb.cache.vlog_generation.rewrite.last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastRewriteUnixNano.Load()) stats["treedb.cache.vlog_generation.gc.deleted_segments"] = fmt.Sprintf("%d", db.vlogGenerationGCSegmentsDeleted.Load()) diff --git a/TreeDB/caching/expvar_stats.go b/TreeDB/caching/expvar_stats.go index 8c3205ca0..5a7240f46 100644 --- a/TreeDB/caching/expvar_stats.go +++ b/TreeDB/caching/expvar_stats.go @@ -139,6 +139,7 @@ func selectTreeDBExpvarStats(stats map[string]string) map[string]any { strings.HasPrefix(k, "treedb.cache.vlog_payload_split.") || strings.HasPrefix(k, "treedb.cache.vlog_auto.") || strings.HasPrefix(k, "treedb.cache.vlog_dict.") || + strings.HasPrefix(k, "treedb.cache.vlog_generation.") || strings.HasPrefix(k, "treedb.cache.vlog_payload_kind.") || strings.HasPrefix(k, "treedb.cache.vlog_outer_leaf_codec.") || strings.HasPrefix(k, "treedb.cache.batch_arena.") { diff --git a/TreeDB/caching/expvar_stats_test.go b/TreeDB/caching/expvar_stats_test.go index f4de57519..ff1982510 100644 --- a/TreeDB/caching/expvar_stats_test.go +++ b/TreeDB/caching/expvar_stats_test.go @@ -28,6 +28,7 @@ func 
TestSelectTreeDBExpvarStatsFiltersAndCoerces(t *testing.T) { "treedb.cache.vlog_dict.current_k": "32", "treedb.cache.vlog_payload_kind.raw_bytes.single_value": "2048", "treedb.cache.vlog_outer_leaf_codec.raw_bytes.lz4": "512", + "treedb.cache.vlog_generation.rewrite.reclaimed_bytes": "1234", "treedb.process.memory.heap_inuse_bytes": "4096", "treedb.process.memory.pool_pressure_level": "critical", "treedb.cache.batch_arena.pool_bytes_estimate": "65536", @@ -80,6 +81,9 @@ func TestSelectTreeDBExpvarStatsFiltersAndCoerces(t *testing.T) { if v, ok := got["treedb.cache.vlog_outer_leaf_codec.raw_bytes.lz4"].(int64); !ok || v != 512 { t.Fatalf("vlog_outer_leaf_codec.raw_bytes.lz4=%T(%v) want int64(512)", got["treedb.cache.vlog_outer_leaf_codec.raw_bytes.lz4"], got["treedb.cache.vlog_outer_leaf_codec.raw_bytes.lz4"]) } + if v, ok := got["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"].(int64); !ok || v != 1234 { + t.Fatalf("vlog_generation.rewrite.reclaimed_bytes=%T(%v) want int64(1234)", got["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"], got["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"]) + } if v, ok := got["treedb.process.memory.heap_inuse_bytes"].(int64); !ok || v != 4096 { t.Fatalf("heap_inuse_bytes=%T(%v) want int64(4096)", got["treedb.process.memory.heap_inuse_bytes"], got["treedb.process.memory.heap_inuse_bytes"]) } diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 0beb47e61..154ed646d 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -273,6 +273,53 @@ func TestShouldRunVlogGenerationRewrite_NoTrigger(t *testing.T) { } } +func TestObserveVlogGenerationRewritePlanOutcome_SelectedTracksBytes(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewritePlanOutcome(backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11}, + SegmentsSelected: 1, + SelectedBytesTotal: 1024, + SelectedBytesLive: 640, + 
SelectedBytesStale: 384, + }, nil) + if got, want := db.vlogGenerationRewritePlanRuns.Load(), uint64(1); got != want { + t.Fatalf("plan runs=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelected.Load(), uint64(1); got != want { + t.Fatalf("plan selected=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelectedBytes.Load(), uint64(1024); got != want { + t.Fatalf("plan selected bytes total=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelectedLiveBytes.Load(), uint64(640); got != want { + t.Fatalf("plan selected bytes live=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelectedStaleBytes.Load(), uint64(384); got != want { + t.Fatalf("plan selected bytes stale=%d want=%d", got, want) + } +} + +func TestObserveVlogGenerationRewritePlanOutcome_SelectedTracksSegmentFallbackBytes(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewritePlanOutcome(backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11, 22}, + SegmentsSelected: 2, + SelectedSegments: []backenddb.ValueLogRewritePlanSegment{ + {FileID: 11, BytesTotal: 100, BytesLive: 25, BytesStale: 75}, + {FileID: 22, BytesTotal: 120, BytesLive: 40, BytesStale: 80}, + }, + }, nil) + if got, want := db.vlogGenerationRewritePlanSelectedBytes.Load(), uint64(220); got != want { + t.Fatalf("fallback selected bytes total=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelectedLiveBytes.Load(), uint64(65); got != want { + t.Fatalf("fallback selected bytes live=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelectedStaleBytes.Load(), uint64(155); got != want { + t.Fatalf("fallback selected bytes stale=%d want=%d", got, want) + } +} + func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesQualityFloor(t *testing.T) { db := &DB{valueLogRewriteTriggerRatioPPM: 200000} if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), 
vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { From 9ce5339a4e3df1de4fbf51631cc64fa12c81941b Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 17:38:33 -1000 Subject: [PATCH 02/61] treedb: add maintenance and vacuum skip counters --- TreeDB/caching/db.go | 63 +++++++++++++++++++ .../caching/vlog_generation_scheduler_test.go | 56 +++++++++++++++++ worklog/2026-03-27.md | 53 ++++++++++++++++ 3 files changed, 172 insertions(+) create mode 100644 worklog/2026-03-27.md diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 6a1255668..1ee972687 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5200,6 +5200,9 @@ type DB struct { vlogGenerationGCRuns atomic.Uint64 vlogGenerationVacuumRuns atomic.Uint64 vlogGenerationVacuumFailures atomic.Uint64 + vlogGenerationVacuumSkippedDisabled atomic.Uint64 + vlogGenerationVacuumSkippedRewriteBytes atomic.Uint64 + vlogGenerationVacuumSkippedCooldown atomic.Uint64 vlogGenerationLastVacuumUnixNano atomic.Int64 vlogGenerationLastRewritePlanUnixNano atomic.Int64 vlogGenerationLastRewriteUnixNano atomic.Int64 @@ -5211,6 +5214,20 @@ type DB struct { vlogGenerationChurnBytes atomic.Uint64 vlogGenerationSchedulerState atomic.Uint32 vlogGenerationMaintenanceActive atomic.Bool + vlogGenerationMaintenanceAttempts atomic.Uint64 + vlogGenerationMaintenanceAcquired atomic.Uint64 + vlogGenerationMaintenanceCollisions atomic.Uint64 + vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 + vlogGenerationMaintenanceSkipPhase atomic.Uint64 + vlogGenerationMaintenanceSkipStageGate atomic.Uint64 + vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 + vlogGenerationMaintenanceSkipPriority atomic.Uint64 + vlogGenerationMaintenanceSkipQuiet atomic.Uint64 + vlogGenerationMaintenanceSkipPreCheckpoint atomic.Uint64 + vlogGenerationMaintenanceSkipCheckpointing atomic.Uint64 + vlogGenerationMaintenancePassNoop atomic.Uint64 + vlogGenerationMaintenancePassWithRewrite atomic.Uint64 + 
vlogGenerationMaintenancePassWithGC atomic.Uint64 vlogGenerationLastReason atomic.Uint32 vlogGenerationCheckpointKickRuns atomic.Uint64 vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 @@ -13016,14 +13033,17 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog if db == nil || db.closing.Load() || db.valueLogGenerationPolicy != uint8(backenddb.ValueLogGenerationHotWarmCold) { return } + db.vlogGenerationMaintenanceAttempts.Add(1) // In WAL-on mode, the periodic "runGC" tick must not enter the maintenance // engine at all. Checkpoint-coupled work belongs to the explicit // checkpoint-kick/deferred paths; letting the periodic GC tick even acquire // maintenanceActive can strand that slot behind hot restore-time locks. if runGC && !db.disableJournal && !opts.bypassQuiet { + db.vlogGenerationMaintenanceSkipWALOnPeriodic.Add(1) return } if db.suppressBackgroundVlogGenerationForMaintenancePhase() { + db.vlogGenerationMaintenanceSkipPhase.Add(1) if opts.debugSource != "" { db.debugVlogMaintf( "maintenance_skip reason=maintenance_phase source=%s phase=%s checkpoint_pending=%t deferred_pending=%t", @@ -13039,6 +13059,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // checkpoint-kick path can race otherwise, which causes overlapping rewrite // runs to compete on the same resume queue. if !db.vlogGenerationMaintenanceActive.CompareAndSwap(false, true) { + db.vlogGenerationMaintenanceCollisions.Add(1) // Checkpoint-kick retries are high-priority and quiet-window-bypassed by // design. If they collide with an active pass, queue exactly one retry to // run right after the active pass exits. 
@@ -13057,6 +13078,9 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog return } acquired = true + db.vlogGenerationMaintenanceAcquired.Add(1) + rewriteRunsBefore := db.vlogGenerationRewriteRuns.Load() + gcRunsBefore := db.vlogGenerationGCRuns.Load() activeSource := vlogGenerationMaintenanceDebugSource(opts) activeStart := time.Now() db.debugVlogMaintf( @@ -13082,6 +13106,17 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // the original retry goroutine to still be alive. db.scheduleDueVlogGenerationDeferredMaintenance() db.schedulePendingVlogGenerationCheckpointKick() + rewriteRan := db.vlogGenerationRewriteRuns.Load() > rewriteRunsBefore + gcRan := db.vlogGenerationGCRuns.Load() > gcRunsBefore + if rewriteRan { + db.vlogGenerationMaintenancePassWithRewrite.Add(1) + } + if gcRan { + db.vlogGenerationMaintenancePassWithGC.Add(1) + } + if !rewriteRan && !gcRan { + db.vlogGenerationMaintenancePassNoop.Add(1) + } }() now := time.Now() quiet := db.foregroundActivityQuietFor(now, vlogGenerationMaintenanceQuietWindow, vlogForegroundReadQuietWindow) @@ -13124,16 +13159,19 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // passes spend long maintenance windows before the confirmation delay // has elapsed. The only valid next step is to wait for confirmation. if !vlogGenerationIsStageConfirmSource(opts) { + db.vlogGenerationMaintenanceSkipStageGate.Add(1) return } } else if !vlogGenerationIsStageConfirmSource(opts) { // When confirmation becomes due, reserve the maintenance slot for the // explicit stage-confirm wake instead of letting generic retries or // periodic passes reacquire it first. 
+ db.vlogGenerationMaintenanceSkipStageGate.Add(1) return } } if !stagePending && ageBlockedDue && !vlogGenerationIsAgeBlockedSource(opts) { + db.vlogGenerationMaintenanceSkipAgeBlocked.Add(1) return } // Checkpoint-collision retries and timer-driven confirmation wakes should run @@ -13141,11 +13179,13 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // collisions where periodic maintenance keeps reacquiring the scheduler while // the higher-priority retry is still trying to run. if !opts.bypassQuiet && (db.vlogGenerationCheckpointKickPending.Load() || db.vlogGenerationDeferredMaintenancePending.Load()) { + db.vlogGenerationMaintenanceSkipPriority.Add(1) return } // Explicit GC runs bypass the foreground quiet-window gate so callers can // force a safety/cleanup pass even while foreground activity is ongoing. if !runGC && !opts.bypassQuiet && !quiet { + db.vlogGenerationMaintenanceSkipQuiet.Add(1) return } // In WAL-off mode, do not start rewrite/GC planning before the first @@ -13156,6 +13196,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // before the first checkpoint; starving that path causes the main value-log // lane to grow unchecked during restore. if db.disableJournal && db.checkpointRuns.Load() == 0 && !runGC && len(rewriteQueue) == 0 && !opts.skipCheckpoint { + db.vlogGenerationMaintenanceSkipPreCheckpoint.Add(1) return } // Retained-prune and generation maintenance use the same foreground quiet-window gate. 
@@ -13176,10 +13217,12 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog return } } else { + db.vlogGenerationMaintenanceSkipCheckpointing.Add(1) return } } if db.checkpointing.Load() { + db.vlogGenerationMaintenanceSkipCheckpointing.Add(1) return } now = time.Now() @@ -14144,6 +14187,7 @@ func (db *DB) maybeRunVlogGenerationIndexVacuum(rewriteBytesIn int64) { return } if envBool(envDisableVlogGenerationVacuum) { + db.vlogGenerationVacuumSkippedDisabled.Add(1) return } vacuumer, ok := db.backend.(backendIndexVacuumer) @@ -14186,12 +14230,14 @@ func (db *DB) shouldRunVlogGenerationIndexVacuum(rewriteBytesIn int64, now time. return false } if rewriteBytesIn < vlogGenerationVacuumTriggerRewriteBytes { + db.vlogGenerationVacuumSkippedRewriteBytes.Add(1) return false } last := db.vlogGenerationLastVacuumUnixNano.Load() if last > 0 { lastAt := time.Unix(0, last) if now.Sub(lastAt) < vlogGenerationVacuumMinInterval { + db.vlogGenerationVacuumSkippedCooldown.Add(1) return false } } @@ -19491,6 +19537,20 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.checkpoint_kick.runs"] = fmt.Sprintf("%d", db.vlogGenerationCheckpointKickRuns.Load()) stats["treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"] = fmt.Sprintf("%d", db.vlogGenerationCheckpointKickRewriteRuns.Load()) stats["treedb.cache.vlog_generation.checkpoint_kick.gc_runs"] = fmt.Sprintf("%d", db.vlogGenerationCheckpointKickGCRuns.Load()) + stats["treedb.cache.vlog_generation.maintenance.attempts"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceAttempts.Load()) + stats["treedb.cache.vlog_generation.maintenance.acquired"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceAcquired.Load()) + stats["treedb.cache.vlog_generation.maintenance.collisions"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceCollisions.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.wal_on_periodic"] = fmt.Sprintf("%d", 
db.vlogGenerationMaintenanceSkipWALOnPeriodic.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.maintenance_phase"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipPhase.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipStageGate.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.age_blocked_gate"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipAgeBlocked.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.priority_pending"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipPriority.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.quiet_window"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipQuiet.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.before_first_checkpoint"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipPreCheckpoint.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.checkpoint_inflight"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipCheckpointing.Load()) + stats["treedb.cache.vlog_generation.maintenance.passes.noop"] = fmt.Sprintf("%d", db.vlogGenerationMaintenancePassNoop.Load()) + stats["treedb.cache.vlog_generation.maintenance.passes.with_rewrite"] = fmt.Sprintf("%d", db.vlogGenerationMaintenancePassWithRewrite.Load()) + stats["treedb.cache.vlog_generation.maintenance.passes.with_gc"] = fmt.Sprintf("%d", db.vlogGenerationMaintenancePassWithGC.Load()) stats["treedb.cache.vlog_generation.churn_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationChurnBytes.Load()) stats["treedb.cache.vlog_generation.churn_bytes_per_sec"] = fmt.Sprintf("%d", db.vlogGenerationLastChurnBps.Load()) stats["treedb.cache.vlog_generation.rewrite.queue_len"] = fmt.Sprintf("%d", rewriteQueueLen) @@ -19554,6 +19614,9 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.gc.dry_run.last_eligible_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCDryRunSegsEligible.Load()) 
stats["treedb.cache.vlog_generation.vacuum.runs"] = fmt.Sprintf("%d", db.vlogGenerationVacuumRuns.Load()) stats["treedb.cache.vlog_generation.vacuum.failures"] = fmt.Sprintf("%d", db.vlogGenerationVacuumFailures.Load()) + stats["treedb.cache.vlog_generation.vacuum.skipped_disabled"] = fmt.Sprintf("%d", db.vlogGenerationVacuumSkippedDisabled.Load()) + stats["treedb.cache.vlog_generation.vacuum.skipped_rewrite_bytes"] = fmt.Sprintf("%d", db.vlogGenerationVacuumSkippedRewriteBytes.Load()) + stats["treedb.cache.vlog_generation.vacuum.skipped_cooldown"] = fmt.Sprintf("%d", db.vlogGenerationVacuumSkippedCooldown.Load()) stats["treedb.cache.vlog_generation.vacuum.last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastVacuumUnixNano.Load()) stats["treedb.cache.vlog_generation.remap.successes"] = fmt.Sprintf("%d", db.vlogGenerationRemapSuccesses.Load()) stats["treedb.cache.vlog_generation.remap.failures"] = fmt.Sprintf("%d", db.vlogGenerationRemapFailures.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 154ed646d..8211404bb 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -320,6 +320,62 @@ func TestObserveVlogGenerationRewritePlanOutcome_SelectedTracksSegmentFallbackBy } } +func TestMaybeRunVlogGenerationMaintenanceWithOptions_TracksWalOnPeriodicSkip(t *testing.T) { + db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} + db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{}) + if got, want := db.vlogGenerationMaintenanceAttempts.Load(), uint64(1); got != want { + t.Fatalf("maintenance attempts=%d want=%d", got, want) + } + if got, want := db.vlogGenerationMaintenanceSkipWALOnPeriodic.Load(), uint64(1); got != want { + t.Fatalf("maintenance wal-on periodic skips=%d want=%d", got, want) + } + if got := db.vlogGenerationMaintenanceAcquired.Load(); got != 0 { + 
t.Fatalf("maintenance acquired=%d want=0", got) + } +} + +func TestMaybeRunVlogGenerationMaintenanceWithOptions_TracksCollision(t *testing.T) { + db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} + db.vlogGenerationMaintenanceActive.Store(true) + db.maybeRunVlogGenerationMaintenanceWithOptions(false, vlogGenerationMaintenanceOptions{}) + if got, want := db.vlogGenerationMaintenanceAttempts.Load(), uint64(1); got != want { + t.Fatalf("maintenance attempts=%d want=%d", got, want) + } + if got, want := db.vlogGenerationMaintenanceCollisions.Load(), uint64(1); got != want { + t.Fatalf("maintenance collisions=%d want=%d", got, want) + } + if got := db.vlogGenerationMaintenanceAcquired.Load(); got != 0 { + t.Fatalf("maintenance acquired=%d want=0", got) + } +} + +func TestShouldRunVlogGenerationIndexVacuum_TracksSkipReasons(t *testing.T) { + db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} + now := time.Now() + if db.shouldRunVlogGenerationIndexVacuum(vlogGenerationVacuumTriggerRewriteBytes-1, now) { + t.Fatalf("expected vacuum to skip below rewrite trigger") + } + if got, want := db.vlogGenerationVacuumSkippedRewriteBytes.Load(), uint64(1); got != want { + t.Fatalf("vacuum skipped_rewrite_bytes=%d want=%d", got, want) + } + db.vlogGenerationLastVacuumUnixNano.Store(now.UnixNano()) + if db.shouldRunVlogGenerationIndexVacuum(vlogGenerationVacuumTriggerRewriteBytes, now) { + t.Fatalf("expected vacuum to skip during cooldown") + } + if got, want := db.vlogGenerationVacuumSkippedCooldown.Load(), uint64(1); got != want { + t.Fatalf("vacuum skipped_cooldown=%d want=%d", got, want) + } +} + +func TestMaybeRunVlogGenerationIndexVacuum_TracksDisabledSkip(t *testing.T) { + t.Setenv(envDisableVlogGenerationVacuum, "1") + db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} + db.maybeRunVlogGenerationIndexVacuum(vlogGenerationVacuumTriggerRewriteBytes) + if got, want := 
db.vlogGenerationVacuumSkippedDisabled.Load(), uint64(1); got != want { + t.Fatalf("vacuum skipped_disabled=%d want=%d", got, want) + } +} + func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesQualityFloor(t *testing.T) { db := &DB{valueLogRewriteTriggerRatioPPM: 200000} if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md new file mode 100644 index 000000000..5d0fa2ff5 --- /dev/null +++ b/worklog/2026-03-27.md @@ -0,0 +1,53 @@ +# Work Log - 2026-03-27 + +- Added live value-log maintenance observability for `run_celestia` investigation: + - `TreeDB/caching/db.go` + - maintenance gate counters: + - `treedb.cache.vlog_generation.maintenance.attempts` + - `treedb.cache.vlog_generation.maintenance.acquired` + - `treedb.cache.vlog_generation.maintenance.collisions` + - `treedb.cache.vlog_generation.maintenance.skip.*` + - `treedb.cache.vlog_generation.maintenance.passes.{noop,with_rewrite,with_gc}` + - vacuum skip counters: + - `treedb.cache.vlog_generation.vacuum.skipped_disabled` + - `treedb.cache.vlog_generation.vacuum.skipped_rewrite_bytes` + - `treedb.cache.vlog_generation.vacuum.skipped_cooldown` + - Added/updated tests in `TreeDB/caching/vlog_generation_scheduler_test.go` for: + - WAL-on periodic skip accounting + - maintenance collision accounting + - vacuum skip-reason accounting + +- Validation: + - `go test ./TreeDB/caching -count=1` + +- `run_celestia` validation run (fast profile, local gomap override): + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast STOP_AT_LOCAL_HEIGHT=500 FREEZE_REMOTE_HEIGHT_AT_START=1 NO_PROGRESS_WARN_SECONDS=120 NO_PROGRESS_FAIL_SECONDS=1800 HEAP_CAPTURE_RSS_DELTA_KB=1 CAPTURE_HEAP_ON_MAX_RSS=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327173138` + - duration / stop condition: + - 
`duration_seconds=277` + - local-height stop target hit at `local=10413004` + - disk: + - `end_app_bytes=5011158649` + - `disk-breakdown.log` shows dominant `maindb/wal/value-l0-*` files (~256MiB each) + +- Latest diagnostics snapshot used for maintenance counters: + - file: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327173138/sync/diagnostics/pprof-heap-max-rss-7070580k-20260327173541.treedb_vars.json` + - key counters: + - `maintenance.attempts=637` + - `maintenance.acquired=56` + - `maintenance.collisions=581` + - `maintenance.passes.noop=53` + - `maintenance.passes.with_gc=2` + - `maintenance.passes.with_rewrite=0` + - `maintenance.skip.quiet_window=26` + - `gc.runs=2` + - `rewrite.runs=0` + - `vacuum.runs=0` + - `scheduler_last_reason=periodic_gc` + +- Interpretation (for next slice): + - During this early state-sync window, rewrite did not trigger and therefore vacuum never became eligible on the post-rewrite path. + - The dominant scheduler behavior was active-pass contention (`collisions=581`) and noop acquired passes; this is now directly measurable. 
From bf11ec9d19a745e83d8dafd007b5f443f518da8a Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 17:49:26 -1000 Subject: [PATCH 03/61] treedb: coalesce maintenance retries under load --- TreeDB/caching/db.go | 33 +++++++++++++++++++ .../caching/vlog_generation_scheduler_test.go | 30 +++++++++++++++++ worklog/2026-03-27.md | 32 ++++++++++++++++++ 3 files changed, 95 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 1ee972687..438019ec9 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -12188,6 +12188,9 @@ func (db *DB) maybeRunPeriodicVlogGenerationMaintenance(runGC bool) bool { if db == nil { return false } + if db.vlogGenerationMaintenanceActive.Load() { + return false + } if db.suppressBackgroundVlogGenerationForMaintenancePhase() { db.debugVlogMaintf("periodic_skip reason=maintenance_phase phase=%s run_gc=%t", maintenancePhaseString(uint32(db.MaintenancePhase())), runGC) return false @@ -12968,6 +12971,36 @@ func (db *DB) runVlogGenerationMaintenanceRetries(opts vlogGenerationMaintenance deadline := time.Now().Add(retryWindow) sleepDelay := 10 * time.Millisecond for !db.closing.Load() { + // Once retry intent is already queued, avoid repeatedly colliding with + // the active maintenance pass; wait for release or deadline instead. 
+ if db.vlogGenerationMaintenanceActive.Load() { + if stopWhenAcquired && db.vlogGenerationDeferredMaintenancePending.Load() { + if time.Now().After(deadline) { + return + } + time.Sleep(sleepDelay) + if sleepDelay < 100*time.Millisecond { + sleepDelay *= 2 + if sleepDelay > 100*time.Millisecond { + sleepDelay = 100 * time.Millisecond + } + } + continue + } + if !stopWhenAcquired && db.vlogGenerationCheckpointKickPending.Load() { + if time.Now().After(deadline) { + return + } + time.Sleep(sleepDelay) + if sleepDelay < 100*time.Millisecond { + sleepDelay *= 2 + if sleepDelay > 100*time.Millisecond { + sleepDelay = 100 * time.Millisecond + } + } + continue + } + } attempt++ if opts.debugSource != "" { db.debugVlogMaintf( diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 8211404bb..76eb3f288 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -376,6 +376,36 @@ func TestMaybeRunVlogGenerationIndexVacuum_TracksDisabledSkip(t *testing.T) { } } +func TestRunVlogGenerationMaintenanceRetries_CoalescesPendingCollisionRetries(t *testing.T) { + db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} + + db.vlogGenerationMaintenanceActive.Store(true) + db.vlogGenerationCheckpointKickPending.Store(true) + db.runVlogGenerationMaintenanceRetries(vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: false, + rewriteDebtDrain: true, + debugSource: "checkpoint_pending", + }, 30*time.Millisecond, false) + if got := db.vlogGenerationMaintenanceCollisions.Load(); got != 0 { + t.Fatalf("checkpoint pending retry collisions=%d want=0", got) + } + + db.vlogGenerationMaintenanceActive.Store(true) + db.vlogGenerationDeferredMaintenancePending.Store(true) + db.runVlogGenerationMaintenanceRetries(vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + 
skipCheckpoint: false, + rewriteDebtDrain: true, + debugSource: "rewrite_stage_confirm", + }, 30*time.Millisecond, true) + if got := db.vlogGenerationMaintenanceCollisions.Load(); got != 0 { + t.Fatalf("deferred pending retry collisions=%d want=0", got) + } +} + func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesQualityFloor(t *testing.T) { db := &DB{valueLogRewriteTriggerRatioPPM: 200000} if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index 5d0fa2ff5..586eab726 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -51,3 +51,35 @@ - Interpretation (for next slice): - During this early state-sync window, rewrite did not trigger and therefore vacuum never became eligible on the post-rewrite path. - The dominant scheduler behavior was active-pass contention (`collisions=581`) and noop acquired passes; this is now directly measurable. + +- Collision-coalescing follow-up (same day): + - code changes: + - `TreeDB/caching/db.go` + - `runVlogGenerationMaintenanceRetries`: when retry intent is already pending and `maintenanceActive` is true, wait/backoff instead of re-entering `maybeRun...` and creating repeated collisions. + - `maybeRunPeriodicVlogGenerationMaintenance`: skip periodic entry while `maintenanceActive` is true. + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - added `TestRunVlogGenerationMaintenanceRetries_CoalescesPendingCollisionRetries`. 
+ - validation: + - `go test ./TreeDB/caching -count=1` + +- `run_celestia` comparison rerun after collision-coalescing: + - command (same as prior comparison run): + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast STOP_AT_LOCAL_HEIGHT=500 FREEZE_REMOTE_HEIGHT_AT_START=1 NO_PROGRESS_WARN_SECONDS=120 NO_PROGRESS_FAIL_SECONDS=1800 HEAP_CAPTURE_RSS_DELTA_KB=1 CAPTURE_HEAP_ON_MAX_RSS=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327174335` + - sampled snapshot: + - `pprof-heap-max-rss-6605092k-20260327174744.treedb_vars.json` + +- Baseline vs new counter delta (same metric keys/sampling style): + - baseline snapshot: `pprof-heap-max-rss-7070580k-20260327173541.treedb_vars.json` + - `maintenance.attempts`: `637 -> 124` + - `maintenance.acquired`: `56 -> 119` + - `maintenance.collisions`: `581 -> 5` + - `maintenance.passes.noop`: `53 -> 116` + - `maintenance.passes.with_rewrite`: `0 -> 1` + - `maintenance.passes.with_gc`: `2 -> 1` + - `rewrite.runs`: `0 -> 1` + - `vacuum.runs`: `0 -> 1` + +- Interpretation: + - Coalescing retry loops materially reduced collision churn and allowed at least one rewrite+vacuum pass to complete in the same early-state-sync lab window. 
From c83e2d6164ef50699c194048f5e6bae0e39a220c Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 17:59:27 -1000 Subject: [PATCH 04/61] treedb: skip hot periodic maintenance preflight --- TreeDB/caching/db.go | 13 +++++++ .../caching/vlog_generation_scheduler_test.go | 34 +++++++++++++++++++ worklog/2026-03-27.md | 34 +++++++++++++++++++ 3 files changed, 81 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 438019ec9..5671e2383 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -12195,6 +12195,19 @@ func (db *DB) maybeRunPeriodicVlogGenerationMaintenance(runGC bool) bool { db.debugVlogMaintf("periodic_skip reason=maintenance_phase phase=%s run_gc=%t", maintenancePhaseString(uint32(db.MaintenancePhase())), runGC) return false } + // Coarse preflight: while foreground activity is hot, avoid entering the + // maintenance engine unless a deferred/checkpoint wake is pending. This + // prevents high-frequency periodic no-op acquisitions. + if !runGC { + now := time.Now() + quiet := db.foregroundActivityQuietFor(now, vlogGenerationMaintenanceQuietWindow, vlogForegroundReadQuietWindow) + if !quiet && + !db.vlogGenerationCheckpointKickPending.Load() && + !db.vlogGenerationDeferredMaintenancePending.Load() && + !db.vlogGenerationDeferredMaintenanceDue(now) { + return false + } + } db.maybeRunVlogGenerationMaintenance(runGC) return true } diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 76eb3f288..1de7c37bb 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -4112,6 +4112,40 @@ func TestVlogGenerationMaintenance_PeriodicSkipsWhenMaintenancePhaseNonSteady(t } } +func TestVlogGenerationMaintenance_PeriodicPreflightSkipsHotNoPending(t *testing.T) { + prepareDirectSchedulerTest(t) + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + 
t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + rewriteResponse: backenddb.ValueLogRewriteStats{BytesBefore: 64, BytesAfter: 32, RecordsCopied: 1}, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + t.Cleanup(cleanup) + skipRetainedPrune(db) + + hot := time.Now().UnixNano() + db.lastForegroundWriteUnixNano.Store(hot) + db.lastForegroundReadUnixNano.Store(hot) + db.vlogGenerationCheckpointKickPending.Store(false) + db.vlogGenerationDeferredMaintenancePending.Store(false) + + if ran := db.maybeRunPeriodicVlogGenerationMaintenance(false); ran { + t.Fatal("periodic maintenance unexpectedly entered during hot foreground with no pending wake") + } + if got := db.vlogGenerationMaintenanceAttempts.Load(); got != 0 { + t.Fatalf("maintenance attempts=%d want 0 on preflight skip", got) + } + if _, calls := recorder.recordedRewrite(); calls != 0 { + t.Fatalf("rewrite calls=%d want 0 on preflight skip", calls) + } +} + func TestCheckpoint_KickSkipsWhenMaintenancePhaseNonSteady(t *testing.T) { disableVlogGenerationLoop(t) diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index 586eab726..b41f022dd 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -83,3 +83,37 @@ - Interpretation: - Coalescing retry loops materially reduced collision churn and allowed at least one rewrite+vacuum pass to complete in the same early-state-sync lab window. + +- Periodic preflight follow-up: + - code changes: + - `TreeDB/caching/db.go` + - `maybeRunPeriodicVlogGenerationMaintenance`: added hot-foreground preflight (when not `runGC`) to skip entering maintenance unless deferred/checkpoint wake is pending. + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - added `TestVlogGenerationMaintenance_PeriodicPreflightSkipsHotNoPending`. 
+ - validation: + - `go test ./TreeDB/caching -count=1` + +- Third `run_celestia` comparison run (same command profile): + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327175340` + - sampled snapshot: + - `pprof-heap-max-rss-6560568k-20260327175747.treedb_vars.json` + - key counters: + - `maintenance.attempts=46` + - `maintenance.acquired=41` + - `maintenance.collisions=5` + - `maintenance.passes.noop=38` + - `maintenance.passes.with_rewrite=1` + - `maintenance.passes.with_gc=1` + - `maintenance.skip.quiet_window=0` + - `rewrite.runs=1` + - `vacuum.runs=1` + +- Multi-run trend (same lab recipe): + - baseline snapshot: `attempts=637`, `collisions=581`, `rewrite=0`, `vacuum=0` + - collision-coalesced snapshot: `attempts=124`, `collisions=5`, `rewrite=1`, `vacuum=1` + - preflight snapshot: `attempts=46`, `collisions=5`, `rewrite=1`, `vacuum=1` + +- Interpretation: + - Retry coalescing delivered the major contention reduction. + - Periodic preflight further reduced maintenance churn/noop entries while preserving rewrite+vacuum progress in this early-state-sync window. 
From 31fbb0a53e9570c514dd09510370cb8f78ca0b27 Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 18:17:42 -1000 Subject: [PATCH 05/61] treedb: add live vacuum/rewrite economics instrumentation --- TreeDB/caching/db.go | 202 +++++++++++++++++- .../caching/vlog_generation_scheduler_test.go | 121 +++++++++++ worklog/2026-03-27.md | 28 +++ 3 files changed, 342 insertions(+), 9 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 5671e2383..20c877be7 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5228,6 +5228,8 @@ type DB struct { vlogGenerationMaintenancePassNoop atomic.Uint64 vlogGenerationMaintenancePassWithRewrite atomic.Uint64 vlogGenerationMaintenancePassWithGC atomic.Uint64 + vlogGenerationMaintenancePassTotalNanos atomic.Uint64 + vlogGenerationMaintenancePassMaxNanos atomic.Uint64 vlogGenerationLastReason atomic.Uint32 vlogGenerationCheckpointKickRuns atomic.Uint64 vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 @@ -5252,6 +5254,15 @@ type DB struct { // budget while still running maintenance at coarse intervals. 
vlogGenerationRewriteBudgetLastUnixNano atomic.Int64 vlogGenerationRewriteBudgetTokensBytes atomic.Int64 + vlogGenerationRewriteBudgetConsumed atomic.Uint64 + vlogGenerationRewritePlanTotalNanos atomic.Uint64 + vlogGenerationRewritePlanMaxNanos atomic.Uint64 + vlogGenerationRewriteExecTotalNanos atomic.Uint64 + vlogGenerationRewriteExecMaxNanos atomic.Uint64 + vlogGenerationGCExecTotalNanos atomic.Uint64 + vlogGenerationGCExecMaxNanos atomic.Uint64 + vlogGenerationVacuumExecTotalNanos atomic.Uint64 + vlogGenerationVacuumExecMaxNanos atomic.Uint64 bgErrMu sync.Mutex bgErr error @@ -12404,6 +12415,9 @@ func (db *DB) vlogGenerationConsumeRewriteBudgetBytes(n int64) { next = 0 } if db.vlogGenerationRewriteBudgetTokensBytes.CompareAndSwap(cur, next) { + if consumed := cur - next; consumed > 0 { + db.vlogGenerationRewriteBudgetConsumed.Add(uint64(consumed)) + } return } } @@ -12438,6 +12452,50 @@ func sumVlogRewritePlanLiveBytes(segments []backenddb.ValueLogRewritePlanSegment return sum, ok } +func observeDurationNanos(total, max *atomic.Uint64, d time.Duration) { + if total == nil || max == nil || d <= 0 { + return + } + n := uint64(d) + total.Add(n) + updateAtomicMaxUint64(max, n) +} + +func (db *DB) observeVlogGenerationMaintenancePassDuration(d time.Duration) { + if db == nil { + return + } + observeDurationNanos(&db.vlogGenerationMaintenancePassTotalNanos, &db.vlogGenerationMaintenancePassMaxNanos, d) +} + +func (db *DB) observeVlogGenerationRewritePlanDuration(d time.Duration) { + if db == nil { + return + } + observeDurationNanos(&db.vlogGenerationRewritePlanTotalNanos, &db.vlogGenerationRewritePlanMaxNanos, d) +} + +func (db *DB) observeVlogGenerationRewriteExecDuration(d time.Duration) { + if db == nil { + return + } + observeDurationNanos(&db.vlogGenerationRewriteExecTotalNanos, &db.vlogGenerationRewriteExecMaxNanos, d) +} + +func (db *DB) observeVlogGenerationGCExecDuration(d time.Duration) { + if db == nil { + return + } + 
observeDurationNanos(&db.vlogGenerationGCExecTotalNanos, &db.vlogGenerationGCExecMaxNanos, d) +} + +func (db *DB) observeVlogGenerationVacuumExecDuration(d time.Duration) { + if db == nil { + return + } + observeDurationNanos(&db.vlogGenerationVacuumExecTotalNanos, &db.vlogGenerationVacuumExecMaxNanos, d) +} + func vlogGenerationRewriteLedgerIDs(segments []backenddb.ValueLogRewritePlanSegment) []uint32 { if len(segments) == 0 { return nil @@ -12453,9 +12511,14 @@ func vlogGenerationRewriteLedgerIDs(segments []backenddb.ValueLogRewritePlanSegm } func (db *DB) observeVlogGenerationRewritePlanOutcome(plan backenddb.ValueLogRewritePlan, err error) { + db.observeVlogGenerationRewritePlanOutcomeWithDuration(plan, err, 0) +} + +func (db *DB) observeVlogGenerationRewritePlanOutcomeWithDuration(plan backenddb.ValueLogRewritePlan, err error, dur time.Duration) { if db == nil { return } + db.observeVlogGenerationRewritePlanDuration(dur) db.vlogGenerationRewritePlanRuns.Add(1) if err != nil { if isVlogGenerationPlannerCanceled(err) { @@ -13139,13 +13202,15 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog db.vlogGenerationDeferredMaintenancePending.Load(), ) defer func() { + passDur := time.Since(activeStart) db.debugVlogMaintf( "maintenance_active_release source=%s dur_ms=%d checkpoint_pending=%t deferred_pending=%t", activeSource, - time.Since(activeStart).Milliseconds(), + passDur.Milliseconds(), db.vlogGenerationCheckpointKickPending.Load(), db.vlogGenerationDeferredMaintenancePending.Load(), ) + db.observeVlogGenerationMaintenancePassDuration(passDur) db.vlogGenerationMaintenanceActive.Store(false) // If a deferred confirmation/age wake became due while this pass held the // scheduler active, requeue it immediately on exit instead of relying on @@ -13385,6 +13450,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog planStart := time.Now() plan, err := planner.ValueLogRewritePlan(ctx, planOpts) cancel() + 
planDur := time.Since(planStart) db.debugVlogMaintf( "rewrite_plan stale_ratio_trigger min_ratio=%.6f max_source_bytes=%d selected=%d/%d selected_bytes_total=%d selected_bytes_live=%d selected_bytes_stale=%d total_bytes=%d live_bytes=%d stale_bytes=%d dur_ms=%.3f err=%v", minStaleRatio, @@ -13397,10 +13463,10 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog plan.BytesTotal, plan.BytesLive, plan.BytesStale, - float64(time.Since(planStart).Microseconds())/1000, + float64(planDur.Microseconds())/1000, err, ) - db.observeVlogGenerationRewritePlanOutcome(plan, err) + db.observeVlogGenerationRewritePlanOutcomeWithDuration(plan, err, planDur) updatePlanTimestamp := false if err != nil { db.clearVlogGenerationRewriteAgeBlockedUntil() @@ -13537,6 +13603,7 @@ planned: MinSegmentAge: vlogGenerationRewriteMinSegmentAge, }) cancel() + planDur := time.Since(planStart) db.debugVlogMaintf( "rewrite_plan pre_rewrite max_source_bytes=%d min_ratio=%.6f min_stale_bytes=%d selected=%d/%d selected_bytes_total=%d selected_bytes_live=%d selected_bytes_stale=%d total_bytes=%d live_bytes=%d stale_bytes=%d dur_ms=%.3f err=%v", maxSourceBytes, @@ -13550,10 +13617,10 @@ planned: plan.BytesTotal, plan.BytesLive, plan.BytesStale, - float64(time.Since(planStart).Microseconds())/1000, + float64(planDur.Microseconds())/1000, err, ) - db.observeVlogGenerationRewritePlanOutcome(plan, err) + db.observeVlogGenerationRewritePlanOutcomeWithDuration(plan, err, planDur) if err != nil { db.clearVlogGenerationRewriteAgeBlockedUntil() if isVlogGenerationPlannerCanceled(err) { @@ -13826,8 +13893,10 @@ planned: rewriteStart := time.Now() stats, err := rewriter.ValueLogRewriteOnline(ctx, rewriteOpts) cancel() + rewriteDur := time.Since(rewriteStart) + db.observeVlogGenerationRewriteExecDuration(rewriteDur) if err != nil { - db.debugVlogMaintf("rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), err, 
float64(time.Since(rewriteStart).Microseconds())/1000) + db.debugVlogMaintf("rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), err, float64(rewriteDur.Microseconds())/1000) if errors.Is(err, context.Canceled) { db.observeVlogGenerationRewriteCanceled() if len(processedRewriteIDs) > 0 { @@ -13847,7 +13916,7 @@ planned: stats.BytesBefore, stats.BytesAfter, stats.RecordsCopied, - float64(time.Since(rewriteStart).Microseconds())/1000, + float64(rewriteDur.Microseconds())/1000, ) effectiveBytesBefore := int64(stats.BytesBefore) effectiveBytesAfter := int64(stats.BytesAfter) @@ -13864,8 +13933,10 @@ planned: ProtectedPaths: db.valueLogProtectedPaths(), }) gcCancel() + gcDur := time.Since(gcStart) + db.observeVlogGenerationGCExecDuration(gcDur) if gcErr != nil { - db.debugVlogMaintf("gc_after_rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), gcErr, float64(time.Since(gcStart).Microseconds())/1000) + db.debugVlogMaintf("gc_after_rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), gcErr, float64(gcDur.Microseconds())/1000) return fmt.Errorf("generational gc after rewrite: %w", gcErr) } if gcStats.BytesDeleted > 0 { @@ -13875,7 +13946,7 @@ planned: effectiveBytesAfter = 0 } } - db.debugVlogMaintf("gc_after_rewrite_done reason=%s dur_ms=%.3f", vlogGenerationReasonString(reason), float64(time.Since(gcStart).Microseconds())/1000) + db.debugVlogMaintf("gc_after_rewrite_done reason=%s dur_ms=%.3f", vlogGenerationReasonString(reason), float64(gcDur.Microseconds())/1000) } if effectiveBytesBefore > effectiveBytesAfter { db.vlogGenerationRewriteReclaimedBytes.Add(uint64(effectiveBytesBefore - effectiveBytesAfter)) @@ -14052,8 +14123,10 @@ planned: db.vlogGenerationLastGCUnixNano.Store(now.UnixNano()) ctx, cancel := db.foregroundMaintenanceContext(30 * time.Second) gcOpts := backenddb.ValueLogGCOptions{ProtectedPaths: db.valueLogProtectedPaths()} + gcStart := time.Now() gcStats, err := 
gcer.ValueLogGC(ctx, gcOpts) cancel() + db.observeVlogGenerationGCExecDuration(time.Since(gcStart)) if err != nil { return fmt.Errorf("generational gc: %w", err) } @@ -14253,11 +14326,13 @@ func (db *DB) maybeRunVlogGenerationIndexVacuum(rewriteBytesIn int64) { return err } var err error + vacuumStart := time.Now() if db.maintenanceActive.Load() { err = runVacuum() } else { err = db.runWithBackendMaintenance(runVacuum) } + db.observeVlogGenerationVacuumExecDuration(time.Since(vacuumStart)) if err != nil { db.vlogGenerationVacuumFailures.Add(1) db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerError) @@ -19568,7 +19643,63 @@ func (db *DB) Stats() map[string]string { db.vlogGenerationRewriteQueueMu.Lock() rewriteQueueLen := len(db.vlogGenerationRewriteQueue) rewriteQueueLoaded := db.vlogGenerationRewriteQueueLoaded + rewriteLedgerSegments := len(db.vlogGenerationRewriteLedger) + rewritePenaltiesActive := len(db.vlogGenerationRewritePenalties) + rewriteStagePending := db.vlogGenerationRewriteStagePending + rewriteStageObservedNS := db.vlogGenerationRewriteStageObservedUnixNano + rewriteLedgerBytesTotal := int64(0) + rewriteLedgerBytesLive := int64(0) + rewriteLedgerBytesStale := int64(0) + for i := range db.vlogGenerationRewriteLedger { + seg := db.vlogGenerationRewriteLedger[i] + if seg.BytesTotal > 0 { + rewriteLedgerBytesTotal += seg.BytesTotal + } + if seg.BytesLive > 0 { + rewriteLedgerBytesLive += seg.BytesLive + } + if seg.BytesStale > 0 { + rewriteLedgerBytesStale += seg.BytesStale + } + } db.vlogGenerationRewriteQueueMu.Unlock() + rewriteAgeBlockedUntilNS := db.vlogGenerationRewriteAgeBlockedUntilNS.Load() + rewriteAgeBlockedRemainingMS := int64(0) + if rewriteAgeBlockedUntilNS > 0 { + if d := time.Until(time.Unix(0, rewriteAgeBlockedUntilNS)); d > 0 { + rewriteAgeBlockedRemainingMS = d.Milliseconds() + } + } + rewriteBudgetTokens := db.vlogGenerationRewriteBudgetTokensBytes.Load() + if rewriteBudgetTokens < 0 { + rewriteBudgetTokens = 0 + } + 
rewriteBudgetCap := db.vlogGenerationRewriteBudgetCapBytes() + if rewriteBudgetCap < 0 { + rewriteBudgetCap = 0 + } + rewriteBudgetUtilPct := 0.0 + if rewriteBudgetCap > 0 { + rewriteBudgetUtilPct = (float64(rewriteBudgetTokens) / float64(rewriteBudgetCap)) * 100.0 + if rewriteBudgetUtilPct > 100.0 { + rewriteBudgetUtilPct = 100.0 + } + } + maintenancePassTotalNS := db.vlogGenerationMaintenancePassTotalNanos.Load() + maintenancePassMaxNS := db.vlogGenerationMaintenancePassMaxNanos.Load() + maintenancePasses := db.vlogGenerationMaintenanceAcquired.Load() + rewritePlanTotalNS := db.vlogGenerationRewritePlanTotalNanos.Load() + rewritePlanMaxNS := db.vlogGenerationRewritePlanMaxNanos.Load() + rewritePlanRuns := db.vlogGenerationRewritePlanRuns.Load() + rewriteExecTotalNS := db.vlogGenerationRewriteExecTotalNanos.Load() + rewriteExecMaxNS := db.vlogGenerationRewriteExecMaxNanos.Load() + rewriteRuns := db.vlogGenerationRewriteRuns.Load() + gcExecTotalNS := db.vlogGenerationGCExecTotalNanos.Load() + gcExecMaxNS := db.vlogGenerationGCExecMaxNanos.Load() + gcRuns := db.vlogGenerationGCRuns.Load() + vacuumExecTotalNS := db.vlogGenerationVacuumExecTotalNanos.Load() + vacuumExecMaxNS := db.vlogGenerationVacuumExecMaxNanos.Load() + vacuumRuns := db.vlogGenerationVacuumRuns.Load() stats["treedb.cache.vlog_retained_segments"] = fmt.Sprintf("%d", vlogSegments) stats["treedb.cache.vlog_retained_bytes_estimate"] = fmt.Sprintf("%d", vlogBytes) stats["treedb.process.memory.vlog_retained_bytes_estimate"] = fmt.Sprintf("%d", vlogBytes) @@ -19597,15 +19728,40 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.maintenance.passes.noop"] = fmt.Sprintf("%d", db.vlogGenerationMaintenancePassNoop.Load()) stats["treedb.cache.vlog_generation.maintenance.passes.with_rewrite"] = fmt.Sprintf("%d", db.vlogGenerationMaintenancePassWithRewrite.Load()) stats["treedb.cache.vlog_generation.maintenance.passes.with_gc"] = fmt.Sprintf("%d", 
db.vlogGenerationMaintenancePassWithGC.Load()) + stats["treedb.cache.vlog_generation.maintenance.pass.total_ms"] = fmt.Sprintf("%.3f", float64(maintenancePassTotalNS)/float64(time.Millisecond)) + stats["treedb.cache.vlog_generation.maintenance.pass.max_ms"] = fmt.Sprintf("%.3f", float64(maintenancePassMaxNS)/float64(time.Millisecond)) + if maintenancePasses > 0 { + stats["treedb.cache.vlog_generation.maintenance.pass.avg_ms"] = fmt.Sprintf("%.3f", (float64(maintenancePassTotalNS)/float64(maintenancePasses))/float64(time.Millisecond)) + } else { + stats["treedb.cache.vlog_generation.maintenance.pass.avg_ms"] = "0.000" + } stats["treedb.cache.vlog_generation.churn_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationChurnBytes.Load()) stats["treedb.cache.vlog_generation.churn_bytes_per_sec"] = fmt.Sprintf("%d", db.vlogGenerationLastChurnBps.Load()) stats["treedb.cache.vlog_generation.rewrite.queue_len"] = fmt.Sprintf("%d", rewriteQueueLen) stats["treedb.cache.vlog_generation.rewrite.queue_loaded"] = fmt.Sprintf("%t", rewriteQueueLoaded) + stats["treedb.cache.vlog_generation.rewrite.ledger_segments"] = fmt.Sprintf("%d", rewriteLedgerSegments) + stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_total"] = fmt.Sprintf("%d", rewriteLedgerBytesTotal) + stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_live"] = fmt.Sprintf("%d", rewriteLedgerBytesLive) + stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_stale"] = fmt.Sprintf("%d", rewriteLedgerBytesStale) + if rewriteLedgerBytesTotal > 0 { + stats["treedb.cache.vlog_generation.rewrite.ledger_stale_ratio_ppm"] = fmt.Sprintf("%d", (rewriteLedgerBytesStale*1_000_000)/rewriteLedgerBytesTotal) + } else { + stats["treedb.cache.vlog_generation.rewrite.ledger_stale_ratio_ppm"] = "0" + } + stats["treedb.cache.vlog_generation.rewrite.stage_pending"] = fmt.Sprintf("%t", rewriteStagePending) + stats["treedb.cache.vlog_generation.rewrite.stage_observed_unix_nano"] = fmt.Sprintf("%d", rewriteStageObservedNS) + 
stats["treedb.cache.vlog_generation.rewrite.penalties_active"] = fmt.Sprintf("%d", rewritePenaltiesActive) + stats["treedb.cache.vlog_generation.rewrite.age_blocked_until_unix_nano"] = fmt.Sprintf("%d", rewriteAgeBlockedUntilNS) + stats["treedb.cache.vlog_generation.rewrite.age_blocked_remaining_ms"] = fmt.Sprintf("%d", rewriteAgeBlockedRemainingMS) stats["treedb.cache.vlog_generation.hot.segment_target_bytes"] = fmt.Sprintf("%d", db.valueLogGenerationHotTarget) stats["treedb.cache.vlog_generation.warm.segment_target_bytes"] = fmt.Sprintf("%d", db.valueLogGenerationWarmTarget) stats["treedb.cache.vlog_generation.cold.segment_target_bytes"] = fmt.Sprintf("%d", db.valueLogGenerationColdTarget) stats["treedb.cache.vlog_generation.rewrite_budget.bytes_per_sec"] = fmt.Sprintf("%d", db.valueLogRewriteBudgetBytes) stats["treedb.cache.vlog_generation.rewrite_budget.records_per_sec"] = fmt.Sprintf("%d", db.valueLogRewriteBudgetRecords) + stats["treedb.cache.vlog_generation.rewrite_budget.tokens_bytes"] = fmt.Sprintf("%d", rewriteBudgetTokens) + stats["treedb.cache.vlog_generation.rewrite_budget.tokens_cap_bytes"] = fmt.Sprintf("%d", rewriteBudgetCap) + stats["treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct"] = fmt.Sprintf("%.3f", rewriteBudgetUtilPct) + stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteBudgetConsumed.Load()) stats["treedb.cache.vlog_generation.rewrite_trigger.stale_ratio_ppm"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerRatioPPM) stats["treedb.cache.vlog_generation.rewrite_trigger.total_bytes"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerBytes) stats["treedb.cache.vlog_generation.rewrite_trigger.churn_per_sec"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerChurn) @@ -19649,11 +19805,32 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.ineffective_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteIneffectiveLastNS.Load()) 
stats["treedb.cache.vlog_generation.rewrite.ineffective_backoff_seconds"] = fmt.Sprintf("%.0f", vlogGenerationRewriteIneffectiveBackoff.Seconds()) stats["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteReclaimedBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan.total_ms"] = fmt.Sprintf("%.3f", float64(rewritePlanTotalNS)/float64(time.Millisecond)) + stats["treedb.cache.vlog_generation.rewrite.plan.max_ms"] = fmt.Sprintf("%.3f", float64(rewritePlanMaxNS)/float64(time.Millisecond)) + if rewritePlanRuns > 0 { + stats["treedb.cache.vlog_generation.rewrite.plan.avg_ms"] = fmt.Sprintf("%.3f", (float64(rewritePlanTotalNS)/float64(rewritePlanRuns))/float64(time.Millisecond)) + } else { + stats["treedb.cache.vlog_generation.rewrite.plan.avg_ms"] = "0.000" + } + stats["treedb.cache.vlog_generation.rewrite.exec.total_ms"] = fmt.Sprintf("%.3f", float64(rewriteExecTotalNS)/float64(time.Millisecond)) + stats["treedb.cache.vlog_generation.rewrite.exec.max_ms"] = fmt.Sprintf("%.3f", float64(rewriteExecMaxNS)/float64(time.Millisecond)) + if rewriteRuns > 0 { + stats["treedb.cache.vlog_generation.rewrite.exec.avg_ms"] = fmt.Sprintf("%.3f", (float64(rewriteExecTotalNS)/float64(rewriteRuns))/float64(time.Millisecond)) + } else { + stats["treedb.cache.vlog_generation.rewrite.exec.avg_ms"] = "0.000" + } stats["treedb.cache.vlog_generation.rewrite.plan_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastRewritePlanUnixNano.Load()) stats["treedb.cache.vlog_generation.rewrite.last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastRewriteUnixNano.Load()) stats["treedb.cache.vlog_generation.gc.deleted_segments"] = fmt.Sprintf("%d", db.vlogGenerationGCSegmentsDeleted.Load()) stats["treedb.cache.vlog_generation.gc.deleted_bytes"] = fmt.Sprintf("%d", db.vlogGenerationGCBytesDeleted.Load()) stats["treedb.cache.vlog_generation.gc.runs"] = fmt.Sprintf("%d", db.vlogGenerationGCRuns.Load()) + 
stats["treedb.cache.vlog_generation.gc.exec.total_ms"] = fmt.Sprintf("%.3f", float64(gcExecTotalNS)/float64(time.Millisecond)) + stats["treedb.cache.vlog_generation.gc.exec.max_ms"] = fmt.Sprintf("%.3f", float64(gcExecMaxNS)/float64(time.Millisecond)) + if gcRuns > 0 { + stats["treedb.cache.vlog_generation.gc.exec.avg_ms"] = fmt.Sprintf("%.3f", (float64(gcExecTotalNS)/float64(gcRuns))/float64(time.Millisecond)) + } else { + stats["treedb.cache.vlog_generation.gc.exec.avg_ms"] = "0.000" + } stats["treedb.cache.vlog_generation.gc.last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastGCUnixNano.Load()) stats["treedb.cache.vlog_generation.gc.dry_run.last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastGCDryRunUnixNano.Load()) stats["treedb.cache.vlog_generation.gc.dry_run.last_eligible_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCDryRunBytesEligible.Load()) @@ -19663,6 +19840,13 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.vacuum.skipped_disabled"] = fmt.Sprintf("%d", db.vlogGenerationVacuumSkippedDisabled.Load()) stats["treedb.cache.vlog_generation.vacuum.skipped_rewrite_bytes"] = fmt.Sprintf("%d", db.vlogGenerationVacuumSkippedRewriteBytes.Load()) stats["treedb.cache.vlog_generation.vacuum.skipped_cooldown"] = fmt.Sprintf("%d", db.vlogGenerationVacuumSkippedCooldown.Load()) + stats["treedb.cache.vlog_generation.vacuum.exec.total_ms"] = fmt.Sprintf("%.3f", float64(vacuumExecTotalNS)/float64(time.Millisecond)) + stats["treedb.cache.vlog_generation.vacuum.exec.max_ms"] = fmt.Sprintf("%.3f", float64(vacuumExecMaxNS)/float64(time.Millisecond)) + if vacuumRuns > 0 { + stats["treedb.cache.vlog_generation.vacuum.exec.avg_ms"] = fmt.Sprintf("%.3f", (float64(vacuumExecTotalNS)/float64(vacuumRuns))/float64(time.Millisecond)) + } else { + stats["treedb.cache.vlog_generation.vacuum.exec.avg_ms"] = "0.000" + } stats["treedb.cache.vlog_generation.vacuum.last_unix_nano"] = fmt.Sprintf("%d", 
db.vlogGenerationLastVacuumUnixNano.Load()) stats["treedb.cache.vlog_generation.remap.successes"] = fmt.Sprintf("%d", db.vlogGenerationRemapSuccesses.Load()) stats["treedb.cache.vlog_generation.remap.failures"] = fmt.Sprintf("%d", db.vlogGenerationRemapFailures.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 1de7c37bb..c98a0b425 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -5598,3 +5598,124 @@ func TestVlogGenerationGC_SkipsDuringRecentForegroundWrites(t *testing.T) { t.Fatalf("gc calls=%d/%d want 0/0 while foreground writes are hot", dryRunCalls, realCalls) } } + +func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { + prepareDirectSchedulerTest(t) + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{DB: backend} + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + defer cleanup() + + db.vlogGenerationMaintenanceAcquired.Store(2) + db.vlogGenerationMaintenancePassTotalNanos.Store(uint64((40 * time.Millisecond).Nanoseconds())) + db.vlogGenerationMaintenancePassMaxNanos.Store(uint64((30 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewritePlanRuns.Store(4) + db.vlogGenerationRewritePlanTotalNanos.Store(uint64((80 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewritePlanMaxNanos.Store(uint64((50 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewriteRuns.Store(3) + db.vlogGenerationRewriteExecTotalNanos.Store(uint64((150 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewriteExecMaxNanos.Store(uint64((70 * time.Millisecond).Nanoseconds())) + db.vlogGenerationGCRuns.Store(2) + db.vlogGenerationGCExecTotalNanos.Store(uint64((60 * time.Millisecond).Nanoseconds())) + db.vlogGenerationGCExecMaxNanos.Store(uint64((35 * 
time.Millisecond).Nanoseconds())) + db.vlogGenerationVacuumRuns.Store(2) + db.vlogGenerationVacuumExecTotalNanos.Store(uint64((44 * time.Millisecond).Nanoseconds())) + db.vlogGenerationVacuumExecMaxNanos.Store(uint64((25 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewriteBudgetTokensBytes.Store(512) + db.vlogGenerationRewriteBudgetConsumed.Store(1536) + db.vlogGenerationRewriteAgeBlockedUntilNS.Store(time.Now().Add(5 * time.Second).UnixNano()) + + db.vlogGenerationRewriteQueueMu.Lock() + db.vlogGenerationRewriteQueueLoaded = true + db.vlogGenerationRewriteQueue = []uint32{11, 12} + db.vlogGenerationRewriteLedger = []backenddb.ValueLogRewritePlanSegment{ + {FileID: 11, BytesTotal: 1000, BytesLive: 700, BytesStale: 300}, + {FileID: 12, BytesTotal: 500, BytesLive: 500, BytesStale: 0}, + } + db.vlogGenerationRewritePenalties = map[uint32]valueLogGenerationRewritePenalty{ + 11: {Attempts: 1, CooldownUntilUnixNano: time.Now().Add(time.Minute).UnixNano()}, + } + db.vlogGenerationRewriteStagePending = true + db.vlogGenerationRewriteStageObservedUnixNano = 1234 + db.vlogGenerationRewriteQueueMu.Unlock() + + stats := db.Stats() + if got := stats["treedb.cache.vlog_generation.maintenance.pass.total_ms"]; got != "40.000" { + t.Fatalf("maintenance pass total ms=%q want 40.000", got) + } + if got := stats["treedb.cache.vlog_generation.maintenance.pass.max_ms"]; got != "30.000" { + t.Fatalf("maintenance pass max ms=%q want 30.000", got) + } + if got := stats["treedb.cache.vlog_generation.maintenance.pass.avg_ms"]; got != "20.000" { + t.Fatalf("maintenance pass avg ms=%q want 20.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.plan.total_ms"]; got != "80.000" { + t.Fatalf("rewrite plan total ms=%q want 80.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.plan.avg_ms"]; got != "20.000" { + t.Fatalf("rewrite plan avg ms=%q want 20.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.total_ms"]; got != 
"150.000" { + t.Fatalf("rewrite exec total ms=%q want 150.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.avg_ms"]; got != "50.000" { + t.Fatalf("rewrite exec avg ms=%q want 50.000", got) + } + if got := stats["treedb.cache.vlog_generation.gc.exec.total_ms"]; got != "60.000" { + t.Fatalf("gc exec total ms=%q want 60.000", got) + } + if got := stats["treedb.cache.vlog_generation.gc.exec.avg_ms"]; got != "30.000" { + t.Fatalf("gc exec avg ms=%q want 30.000", got) + } + if got := stats["treedb.cache.vlog_generation.vacuum.exec.total_ms"]; got != "44.000" { + t.Fatalf("vacuum exec total ms=%q want 44.000", got) + } + if got := stats["treedb.cache.vlog_generation.vacuum.exec.avg_ms"]; got != "22.000" { + t.Fatalf("vacuum exec avg ms=%q want 22.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.ledger_segments"]; got != "2" { + t.Fatalf("rewrite ledger segments=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_total"]; got != "1500" { + t.Fatalf("rewrite ledger bytes total=%q want 1500", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_live"]; got != "1200" { + t.Fatalf("rewrite ledger bytes live=%q want 1200", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_stale"]; got != "300" { + t.Fatalf("rewrite ledger bytes stale=%q want 300", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.ledger_stale_ratio_ppm"]; got != "200000" { + t.Fatalf("rewrite ledger stale ratio ppm=%q want 200000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.stage_pending"]; got != "true" { + t.Fatalf("rewrite stage pending=%q want true", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.stage_observed_unix_nano"]; got != "1234" { + t.Fatalf("rewrite stage observed=%q want 1234", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.penalties_active"]; got != "1" { + t.Fatalf("rewrite penalties 
active=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.age_blocked_remaining_ms"]; got == "0" { + t.Fatalf("rewrite age blocked remaining ms=%q want >0", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.tokens_bytes"]; got != "512" { + t.Fatalf("rewrite budget tokens bytes=%q want 512", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total"]; got != "1536" { + t.Fatalf("rewrite budget consumed=%q want 1536", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.tokens_cap_bytes"]; got == "0" { + t.Fatalf("rewrite budget cap bytes=%q want non-zero", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct"]; got == "" { + t.Fatalf("rewrite budget utilization pct missing") + } +} diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index b41f022dd..9eebb55ec 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -117,3 +117,31 @@ - Interpretation: - Retry coalescing delivered the major contention reduction. - Periodic preflight further reduced maintenance churn/noop entries while preserving rewrite+vacuum progress in this early-state-sync window. 
+ +- Instrumentation-first follow-up for incremental rewrite economics: + - code changes: + - `TreeDB/caching/db.go` + - added maintenance/rewrite/gc/vacuum duration counters and stats: + - `treedb.cache.vlog_generation.maintenance.pass.{total,max,avg}_ms` + - `treedb.cache.vlog_generation.rewrite.plan.{total,max,avg}_ms` + - `treedb.cache.vlog_generation.rewrite.exec.{total,max,avg}_ms` + - `treedb.cache.vlog_generation.gc.exec.{total,max,avg}_ms` + - `treedb.cache.vlog_generation.vacuum.exec.{total,max,avg}_ms` + - added rewrite backlog/debt visibility stats: + - `treedb.cache.vlog_generation.rewrite.ledger_segments` + - `treedb.cache.vlog_generation.rewrite.ledger_bytes_{total,live,stale}` + - `treedb.cache.vlog_generation.rewrite.ledger_stale_ratio_ppm` + - `treedb.cache.vlog_generation.rewrite.stage_{pending,observed_unix_nano}` + - `treedb.cache.vlog_generation.rewrite.penalties_active` + - `treedb.cache.vlog_generation.rewrite.age_blocked_{until_unix_nano,remaining_ms}` + - added rewrite budget execution stats: + - `treedb.cache.vlog_generation.rewrite_budget.tokens_{bytes,cap_bytes}` + - `treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct` + - `treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total` + - tracked rewrite-budget token consumption inside `vlogGenerationConsumeRewriteBudgetBytes`. + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - added `TestVlogGenerationStats_ReportRewriteBacklogAndDurations`. 
+ +- Validation: + - `go test ./TreeDB/caching -run TestVlogGenerationStats_ReportRewriteBacklogAndDurations -count=1` + - `go test ./TreeDB/caching -count=1` From 7cc50d6de34c87cc60d6991e308d96bb03ad62df Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 18:47:07 -1000 Subject: [PATCH 06/61] treedb: stop maintenance retry collision amplification --- TreeDB/caching/db.go | 36 +++++---------- .../caching/vlog_generation_scheduler_test.go | 13 ++++++ worklog/2026-03-27.md | 44 +++++++++++++++++++ 3 files changed, 68 insertions(+), 25 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 20c877be7..256613925 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -13047,35 +13047,21 @@ func (db *DB) runVlogGenerationMaintenanceRetries(opts vlogGenerationMaintenance deadline := time.Now().Add(retryWindow) sleepDelay := 10 * time.Millisecond for !db.closing.Load() { - // Once retry intent is already queued, avoid repeatedly colliding with - // the active maintenance pass; wait for release or deadline instead. + // Retry loops should never hammer an already-active maintenance pass. + // Wait for release/deadline instead of repeatedly colliding and inflating + // maintenance.attempts/collisions under hot checkpoint-kick activity. 
if db.vlogGenerationMaintenanceActive.Load() { - if stopWhenAcquired && db.vlogGenerationDeferredMaintenancePending.Load() { - if time.Now().After(deadline) { - return - } - time.Sleep(sleepDelay) - if sleepDelay < 100*time.Millisecond { - sleepDelay *= 2 - if sleepDelay > 100*time.Millisecond { - sleepDelay = 100 * time.Millisecond - } - } - continue + if time.Now().After(deadline) { + return } - if !stopWhenAcquired && db.vlogGenerationCheckpointKickPending.Load() { - if time.Now().After(deadline) { - return - } - time.Sleep(sleepDelay) - if sleepDelay < 100*time.Millisecond { - sleepDelay *= 2 - if sleepDelay > 100*time.Millisecond { - sleepDelay = 100 * time.Millisecond - } + time.Sleep(sleepDelay) + if sleepDelay < 100*time.Millisecond { + sleepDelay *= 2 + if sleepDelay > 100*time.Millisecond { + sleepDelay = 100 * time.Millisecond } - continue } + continue } attempt++ if opts.debugSource != "" { diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index c98a0b425..7bb0453e3 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -392,6 +392,19 @@ func TestRunVlogGenerationMaintenanceRetries_CoalescesPendingCollisionRetries(t t.Fatalf("checkpoint pending retry collisions=%d want=0", got) } + db.vlogGenerationMaintenanceActive.Store(true) + db.vlogGenerationCheckpointKickPending.Store(false) + db.runVlogGenerationMaintenanceRetries(vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: false, + rewriteDebtDrain: true, + debugSource: "checkpoint_pending", + }, 30*time.Millisecond, false) + if got := db.vlogGenerationMaintenanceCollisions.Load(); got != 0 { + t.Fatalf("checkpoint retry collisions while active=%d want=0", got) + } + db.vlogGenerationMaintenanceActive.Store(true) db.vlogGenerationDeferredMaintenancePending.Store(true) 
db.runVlogGenerationMaintenanceRetries(vlogGenerationMaintenanceOptions{ diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index 9eebb55ec..b97f3e2e0 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -145,3 +145,47 @@ - Validation: - `go test ./TreeDB/caching -run TestVlogGenerationStats_ReportRewriteBacklogAndDurations -count=1` - `go test ./TreeDB/caching -count=1` + +- `run_celestia` instrumentation readout (application.db instance in expvar snapshots): + - run (`STOP_AT_LOCAL_HEIGHT=500`): + - home: `/home/mikers/.celestia-app-mainnet-treedb-20260327181906` + - key counters at peak snapshot: + - `maintenance.attempts=45`, `acquired=41`, `collisions=4` + - `rewrite.plan_runs=2`, `rewrite.plan_selected=2`, `rewrite.runs=1` + - `gc.runs=1`, `vacuum.runs=1` + - `rewrite_budget.consumed_bytes_total=33073153` + - offline compaction sanity check on that run: + - pre: `du -sb application.db = 4679915182` + - `treemap vlog-rewrite ... -rw` output: `segments_before=20 segments_after=15 bytes_before=4607146646 bytes_after=1983182186 records=957832` + - post: `du -sb application.db = 2021086813` + +- Longer-window stress run exposed retry-collision amplification: + - run (`STOP_AT_LOCAL_HEIGHT=2000`): + - home: `/home/mikers/.celestia-app-mainnet-treedb-20260327182903` + - timeline showed `acquired` flat while attempts/collisions spiked: + - snapshot progression reached `maintenance.attempts=333`, `collisions=304`, `acquired=29` + - `rewrite.plan_runs=3` but `rewrite.plan_selected=0`, `rewrite.runs=0` + - `checkpoint_kick.pending=true` persisted during collision growth + +- Fix for retry-collision amplification: + - code changes: + - `TreeDB/caching/db.go` + - `runVlogGenerationMaintenanceRetries`: when `maintenanceActive` is true, always back off/wait until release/deadline instead of conditionally attempting based on pending flags. 
+ - This avoids high-frequency CAS collisions from checkpoint-kick retry goroutines while a long maintenance pass is active. + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - extended `TestRunVlogGenerationMaintenanceRetries_CoalescesPendingCollisionRetries` with checkpoint-pending=false + active pass case to prevent regression. + - validation: + - `go test ./TreeDB/caching -run TestRunVlogGenerationMaintenanceRetries_CoalescesPendingCollisionRetries -count=1` + - `go test ./TreeDB/caching -count=1` + +- Confirmation run after fix: + - run (`STOP_AT_LOCAL_HEIGHT=2000`): + - home: `/home/mikers/.celestia-app-mainnet-treedb-20260327184030` + - comparable snapshot (`20260327184427`) vs pre-fix bad snapshot (`20260327183236`): + - `maintenance.attempts: 333 -> 38` + - `maintenance.acquired: 29 -> 38` + - `maintenance.collisions: 304 -> 0` + - `rewrite.plan_selected: 0 -> 2` + - `rewrite.runs: 0 -> 1` + - `vacuum.runs: 0 -> 1` + - `rewrite_budget.consumed_bytes_total: 0 -> 33073906` From 00355572957bb6532eec535392a4102c4df1287f Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 19:30:09 -1000 Subject: [PATCH 07/61] treedb: add stage-gate and rewrite segment counters --- TreeDB/caching/db.go | 25 ++++++++++ .../caching/vlog_generation_scheduler_test.go | 16 +++++++ worklog/2026-03-27.md | 47 +++++++++++++++++++ 3 files changed, 88 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 256613925..90e2119f6 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5181,6 +5181,7 @@ type DB struct { vlogGenerationRewritePlanErrors atomic.Uint64 vlogGenerationRewritePlanEmpty atomic.Uint64 vlogGenerationRewritePlanSelected atomic.Uint64 + vlogGenerationRewritePlanSelectedSegments atomic.Uint64 vlogGenerationRewritePlanSelectedBytes atomic.Uint64 vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 @@ -5220,6 +5221,8 @@ type DB struct { 
vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 vlogGenerationMaintenanceSkipPhase atomic.Uint64 vlogGenerationMaintenanceSkipStageGate atomic.Uint64 + vlogGenerationMaintenanceSkipStageNotDue atomic.Uint64 + vlogGenerationMaintenanceSkipStageDue atomic.Uint64 vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 vlogGenerationMaintenanceSkipPriority atomic.Uint64 vlogGenerationMaintenanceSkipQuiet atomic.Uint64 @@ -5259,6 +5262,7 @@ type DB struct { vlogGenerationRewritePlanMaxNanos atomic.Uint64 vlogGenerationRewriteExecTotalNanos atomic.Uint64 vlogGenerationRewriteExecMaxNanos atomic.Uint64 + vlogGenerationRewriteExecSourceSegments atomic.Uint64 vlogGenerationGCExecTotalNanos atomic.Uint64 vlogGenerationGCExecMaxNanos atomic.Uint64 vlogGenerationVacuumExecTotalNanos atomic.Uint64 @@ -12531,6 +12535,18 @@ func (db *DB) observeVlogGenerationRewritePlanOutcomeWithDuration(plan backenddb } if len(plan.SourceFileIDs) > 0 || len(plan.SelectedSegments) > 0 || plan.SegmentsSelected > 0 { db.vlogGenerationRewritePlanSelected.Add(1) + selectedSegments := plan.SegmentsSelected + if selectedSegments <= 0 { + switch { + case len(plan.SelectedSegments) > 0: + selectedSegments = len(plan.SelectedSegments) + case len(plan.SourceFileIDs) > 0: + selectedSegments = len(plan.SourceFileIDs) + } + } + if selectedSegments > 0 { + db.vlogGenerationRewritePlanSelectedSegments.Add(uint64(selectedSegments)) + } selectedTotal := plan.SelectedBytesTotal selectedLive := plan.SelectedBytesLive selectedStale := plan.SelectedBytesStale @@ -13257,6 +13273,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // has elapsed. The only valid next step is to wait for confirmation. 
if !vlogGenerationIsStageConfirmSource(opts) { db.vlogGenerationMaintenanceSkipStageGate.Add(1) + db.vlogGenerationMaintenanceSkipStageNotDue.Add(1) return } } else if !vlogGenerationIsStageConfirmSource(opts) { @@ -13264,6 +13281,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // explicit stage-confirm wake instead of letting generic retries or // periodic passes reacquire it first. db.vlogGenerationMaintenanceSkipStageGate.Add(1) + db.vlogGenerationMaintenanceSkipStageDue.Add(1) return } } @@ -14008,6 +14026,9 @@ planned: } db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) db.vlogGenerationRewriteRuns.Add(1) + if sourceSegments := len(rewriteOpts.SourceFileIDs); sourceSegments > 0 { + db.vlogGenerationRewriteExecSourceSegments.Add(uint64(sourceSegments)) + } rewriteBytesIn := int64(0) if processedLedgerOK { rewriteBytesIn = processedLedgerLiveBytes @@ -19706,6 +19727,8 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.maintenance.skip.wal_on_periodic"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipWALOnPeriodic.Load()) stats["treedb.cache.vlog_generation.maintenance.skip.maintenance_phase"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipPhase.Load()) stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipStageGate.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate_not_due"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipStageNotDue.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate_due_reserved"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipStageDue.Load()) stats["treedb.cache.vlog_generation.maintenance.skip.age_blocked_gate"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipAgeBlocked.Load()) stats["treedb.cache.vlog_generation.maintenance.skip.priority_pending"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipPriority.Load()) 
stats["treedb.cache.vlog_generation.maintenance.skip.quiet_window"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipQuiet.Load()) @@ -19778,9 +19801,11 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.plan_errors"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanErrors.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_empty"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanEmpty.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelected.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_selected_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_live"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedLiveBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedStaleBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteExecSourceSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledLastNS.Load()) stats["treedb.cache.vlog_generation.rewrite.queue_prune_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteQueuePruneRuns.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 7bb0453e3..34d81d1a6 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -5643,6 +5643,10 @@ func 
TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationRewriteBudgetTokensBytes.Store(512) db.vlogGenerationRewriteBudgetConsumed.Store(1536) db.vlogGenerationRewriteAgeBlockedUntilNS.Store(time.Now().Add(5 * time.Second).UnixNano()) + db.vlogGenerationMaintenanceSkipStageNotDue.Store(5) + db.vlogGenerationMaintenanceSkipStageDue.Store(2) + db.vlogGenerationRewritePlanSelectedSegments.Store(6) + db.vlogGenerationRewriteExecSourceSegments.Store(3) db.vlogGenerationRewriteQueueMu.Lock() db.vlogGenerationRewriteQueueLoaded = true @@ -5731,4 +5735,16 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct"]; got == "" { t.Fatalf("rewrite budget utilization pct missing") } + if got := stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate_not_due"]; got != "5" { + t.Fatalf("maintenance skip stage gate not due=%q want 5", got) + } + if got := stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate_due_reserved"]; got != "2" { + t.Fatalf("maintenance skip stage gate due reserved=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.plan_selected_segments_total"]; got != "6" { + t.Fatalf("rewrite plan selected segments total=%q want 6", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"]; got != "3" { + t.Fatalf("rewrite exec source segments total=%q want 3", got) + } } diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index b97f3e2e0..29c5b7b66 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -189,3 +189,50 @@ - `rewrite.runs: 0 -> 1` - `vacuum.runs: 0 -> 1` - `rewrite_budget.consumed_bytes_total: 0 -> 33073906` + +- Stage-gate/selection observability follow-up (live rewrite throughput diagnosis): + - code changes: + - `TreeDB/caching/db.go` + - added stage-gate split counters: + - 
`treedb.cache.vlog_generation.maintenance.skip.stage_gate_not_due` + - `treedb.cache.vlog_generation.maintenance.skip.stage_gate_due_reserved` + - added rewrite selection/execution segment counters: + - `treedb.cache.vlog_generation.rewrite.plan_selected_segments_total` + - `treedb.cache.vlog_generation.rewrite.exec.source_segments_total` + - incremented counters in: + - stage-gate early-return branches (`not_due` vs `due_reserved`) + - rewrite-plan outcome accounting (selected segments) + - rewrite execution completion (source segments executed) + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - extended `TestVlogGenerationStats_ReportRewriteBacklogAndDurations` assertions for new keys. + +- Validation: + - `go test ./TreeDB/caching -count=1` + +- `run_celestia` run with new counters (baseline fast profile, no profile trigger override): + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast FREEZE_REMOTE_HEIGHT_AT_START=1 STOP_AT_LOCAL_HEIGHT=2000 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327192126` + - final snapshot: + - `pprof-heap-max-rss-final-7983364k-20260327192636.treedb_vars.json` + - key counters: + - `maintenance.attempts=38`, `acquired=38`, `collisions=0` + - `rewrite.plan_runs=2`, `rewrite.plan_selected=2` + - `rewrite.plan_selected_segments_total=3` + - `rewrite.runs=1`, `rewrite.exec.source_segments_total=1` + - `rewrite.bytes_in=33073442`, `rewrite.reclaimed_bytes=0` + - `maintenance.skip.stage_gate=7` + - `maintenance.skip.stage_gate_not_due=7` + - `maintenance.skip.stage_gate_due_reserved=0` + - interpretation: + - planner selected more segment debt than was executed in-run (`3 selected vs 1 executed`). + - stage gating was entirely waiting-for-confirmation (`not_due`), not due-slot reservation. + +- Offline rewrite delta for same run home: + - pre: `du -sb application.db = 4707839386` + - `treemap vlog-rewrite ... 
-rw` output: + - `segments_before=20 segments_after=16 bytes_before=4637168004 bytes_after=2039183405 records=964467` + - post: `du -sb application.db = 2077350273` + - interpretation: + - live run still leaves substantial reclaimable headroom; new counters indicate confirmation-gated debt progression as one concrete limiter. From 0dd0b18bbb5a6daa72363cb58731e0ba81fe7898 Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 19:59:59 -1000 Subject: [PATCH 08/61] treedb: speed staged rewrite debt progression --- TreeDB/caching/db.go | 18 +++-- .../caching/vlog_generation_scheduler_test.go | 70 +++++++++++++++++++ worklog/2026-03-27.md | 56 +++++++++++++++ 3 files changed, 139 insertions(+), 5 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 90e2119f6..027e7ca12 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -12306,7 +12306,7 @@ func (db *DB) vlogGenerationRewriteMaxSegmentsForRun(queueLen int, budgetTokens } // Checkpoint-kick retries should keep each debt-drain run small to reduce // write amplification when foreground ingest is still active. - if opts.bypassQuiet && !opts.skipCheckpoint { + if opts.bypassQuiet && !opts.skipCheckpoint && !vlogGenerationIsStageConfirmSource(opts) && !vlogGenerationIsAgeBlockedSource(opts) { return 1 } maxSegments = queueLen @@ -13513,7 +13513,9 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog } confirmed := stableVlogGenerationRewriteLedgerSegments(stagedLedger, plan.SelectedSegments) if len(confirmed) > 0 { - plan = filterVlogGenerationRewritePlanToSegments(plan, confirmed) + // Treat confirmation overlap as a stability signal, then run + // the current sparse plan (not just the overlap subset) so live + // maintenance can make forward progress within short sync windows. shouldRewrite = true reason = vlogGenerationReasonRewriteResume } else { @@ -13790,9 +13792,15 @@ planned: } } rewriteQueue = append([]uint32(nil), rewritePlan.SourceFileIDs...) 
- // Do not debt-drain freshly planned work in the same pass; only apply - // multi-segment debt-drain to explicit resume queues. - rewriteMaxSegments = vlogGenerationRewriteResumeMaxSegments + // Do not debt-drain freshly planned work in the same pass. The only + // exception is a confirmed staged rewrite-resume pass, which should + // be allowed to consume debt in bounded multi-segment chunks. + allowPlanDebtDrain := reason == vlogGenerationReasonRewriteResume && opts.rewriteDebtDrain + if allowPlanDebtDrain { + rewriteMaxSegments = db.vlogGenerationRewriteMaxSegmentsForRun(len(rewriteQueue), budgetTokens, opts) + } else { + rewriteMaxSegments = vlogGenerationRewriteResumeMaxSegments + } // If the token bucket is enabled and empty, persist the plan/ledger but // skip running the rewrite until we have budget to spend. if db.vlogGenerationRewriteBudgetEnabled() && budgetTokens <= 0 { diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 34d81d1a6..99f715ef1 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -2778,6 +2778,76 @@ func TestVlogGenerationRewritePlan_StageConfirmationExecutesConfirmedSubset(t *t } } +func TestVlogGenerationRewritePlan_StageConfirmationDebtDrainProcessesMultipleSegments(t *testing.T) { + prepareDirectSchedulerTest(t) + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + planResponse: backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11, 22, 33}, + SelectedSegments: []backenddb.ValueLogRewritePlanSegment{ + {FileID: 11, BytesTotal: 64 << 20, BytesLive: 8 << 20, BytesStale: 56 << 20, StaleRatio: 0.875}, + {FileID: 22, BytesTotal: 64 << 20, BytesLive: 16 << 20, BytesStale: 48 << 20, StaleRatio: 0.75}, + {FileID: 33, BytesTotal: 64 << 20, BytesLive: 24 << 
20, BytesStale: 40 << 20, StaleRatio: 0.625}, + }, + SegmentsTotal: 3, + SegmentsSelected: 3, + BytesTotal: 192 << 20, + BytesLive: 48 << 20, + BytesStale: 144 << 20, + SelectedBytesTotal: 192 << 20, + SelectedBytesLive: 48 << 20, + SelectedBytesStale: 144 << 20, + }, + rewriteResponse: backenddb.ValueLogRewriteStats{ + BytesBefore: 192 << 20, + BytesAfter: 48 << 20, + RecordsCopied: 3, + }, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + t.Cleanup(cleanup) + db.valueLogRewriteTriggerBytes = 0 + db.valueLogRewriteTriggerRatioPPM = 1 + db.valueLogGenerationHotTarget = 0 + db.vlogGenerationRewriteBudgetTokensBytes.Store(defaultVlogGenerationWarmTargetBytes * 4) + forceVlogMaintenanceIdle(db) + + if err := db.setVlogGenerationRewriteLedgerWithStage([]backenddb.ValueLogRewritePlanSegment{ + {FileID: 11, BytesTotal: 64 << 20, BytesLive: 8 << 20, BytesStale: 56 << 20, StaleRatio: 0.875}, + {FileID: 22, BytesTotal: 64 << 20, BytesLive: 16 << 20, BytesStale: 48 << 20, StaleRatio: 0.75}, + {FileID: 33, BytesTotal: 64 << 20, BytesLive: 24 << 20, BytesStale: 40 << 20, StaleRatio: 0.625}, + }, true, time.Now().Add(-vlogGenerationRewriteMinInterval-time.Second).UnixNano()); err != nil { + t.Fatalf("seed staged rewrite ledger: %v", err) + } + forceRewriteStageConfirmDue(t, db) + + db.maybeRunVlogGenerationMaintenanceWithOptions(false, vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: false, + rewriteDebtDrain: true, + debugSource: "rewrite_stage_confirm", + }) + + rewriteOpts, rewriteCalls := recorder.recordedRewrite() + if rewriteCalls != 1 { + t.Fatalf("rewrite calls after staged confirmation=%d want=1", rewriteCalls) + } + if got := len(rewriteOpts.SourceFileIDs); got <= 1 { + t.Fatalf("rewrite SourceFileIDs after staged confirmation=%v want multiple ids", rewriteOpts.SourceFileIDs) + } + if got := len(rewriteOpts.SourceFileIDs); got > vlogGenerationRewriteDebtDrainMaxSegments { + t.Fatalf("rewrite 
SourceFileIDs len=%d want <= %d", got, vlogGenerationRewriteDebtDrainMaxSegments) + } +} + func TestVlogGenerationRewritePlan_StageConfirmationReplansEvenWhenOtherTriggersFire(t *testing.T) { prepareDirectSchedulerTest(t) diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index 29c5b7b66..6e490ca7a 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -236,3 +236,59 @@ - post: `du -sb application.db = 2077350273` - interpretation: - live run still leaves substantial reclaimable headroom; new counters indicate confirmation-gated debt progression as one concrete limiter. + +- Stage-confirm rewrite progression experiment (post-observability): + - hypothesis: + - live rewrite debt was bottlenecked by stage-confirm overlap collapse + single-segment execution, visible as `plan_selected_segments_total > rewrite.exec.source_segments_total`. + - code changes: + - `TreeDB/caching/db.go` + - `vlogGenerationRewriteMaxSegmentsForRun`: + - keep checkpoint-kick (`bypassQuiet && !skipCheckpoint`) capped to single-segment, + - but allow stage-confirm / age-blocked deferred sources to use bounded debt-drain sizing. + - rewrite execution path when `haveRewritePlan`: + - allow debt-drain sizing for confirmed `rewrite_resume` plans instead of forcing single-segment. + - stale-ratio staged confirmation handling: + - once overlap confirms stability, execute current sparse plan rather than filtering to overlap-only subset. + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - added `TestVlogGenerationRewritePlan_StageConfirmationDebtDrainProcessesMultipleSegments`. 
+ +- Validation: + - `go test ./TreeDB/caching -run 'TestVlogGenerationRewritePlan_StageConfirmationDebtDrainProcessesMultipleSegments|TestVlogGenerationRewriteQueue_CheckpointKickDebtDrainCapsSingleSegment|TestVlogGenerationRewriteQueue_DebtDrainProcessesMultipleSegments|TestVlogGenerationRewritePlan_StageConfirmationExecutesConfirmedSubset' -count=1` + - `go test ./TreeDB/caching -count=1` + - `go test ./TreeDB -count=1` + +- `run_celestia` comparison (same profile/height target): + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast FREEZE_REMOTE_HEIGHT_AT_START=1 STOP_AT_LOCAL_HEIGHT=5000 ~/run_celestia.sh` + - pre-change reference home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327194254` + - final snapshot: `pprof-heap-max-rss-final-6937148k-20260327194740.treedb_vars.json` + - key counters: + - `plan_selected_segments_total=3` + - `rewrite.exec.source_segments_total=1` + - `rewrite.runs=1` + - `rewrite.bytes_in=33081912` + - `rewrite.reclaimed_bytes=0` + - post-change run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327195053` + - final snapshot: `pprof-heap-max-rss-7548312k-20260327195551.treedb_vars.json` + - key counters: + - `plan_selected_segments_total=3` + - `rewrite.exec.source_segments_total=2` + - `rewrite.runs=1` + - `rewrite.bytes_in=38276046` + - `rewrite.reclaimed_bytes=0` + +- offline rewrite deltas (headroom remains large): + - pre-change home `/home/mikers/.celestia-app-mainnet-treedb-20260327194254`: + - pre: `4645103594` + - post: `2016029251` + - tool output: `segments_before=20 segments_after=15 bytes_before=4583082964 bytes_after=1978124746 records=956586` + - post-change home `/home/mikers/.celestia-app-mainnet-treedb-20260327195053`: + - pre: `4653743667` + - post: `2022437394` + - tool output: `segments_before=20 segments_after=15 bytes_before=4598014513 bytes_after=1984532899 records=958463` + +- interpretation: + - stage-confirm policy change increased 
in-run rewritten source segments (`1 -> 2`) in a comparable 5000-height window. + - immediate live reclaim remains `0`, and offline compaction still cuts ~2.6 GiB, so major headroom remains. From 17a907ec3d3e627990d9b38de18384eac83e5d2c Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 20:24:57 -1000 Subject: [PATCH 09/61] treedb: add rewrite no-reclaim diagnostics --- TreeDB/caching/db.go | 22 +++++++++ .../caching/vlog_generation_scheduler_test.go | 16 +++++++ worklog/2026-03-27.md | 46 +++++++++++++++++++ 3 files changed, 84 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 027e7ca12..1baf6be7d 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5175,6 +5175,10 @@ type DB struct { vlogGenerationRewriteBytesIn atomic.Uint64 vlogGenerationRewriteBytesOut atomic.Uint64 vlogGenerationRewriteReclaimedBytes atomic.Uint64 + vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 + vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 + vlogGenerationRewriteNoReclaimRuns atomic.Uint64 + vlogGenerationRewriteNoReclaimStaleBytes atomic.Uint64 vlogGenerationRewriteRuns atomic.Uint64 vlogGenerationRewritePlanRuns atomic.Uint64 vlogGenerationRewritePlanCanceled atomic.Uint64 @@ -13964,6 +13968,14 @@ planned: db.vlogGenerationRewriteReclaimedBytes.Add(uint64(effectiveBytesBefore - effectiveBytesAfter)) } locallyEffectiveProcessedDebt := len(processedRewriteIDs) > 0 && processedLedgerOK && processedLedgerStaleBytes > 0 && stats.RecordsCopied > 0 + if processedLedgerOK { + if processedLedgerLiveBytes > 0 { + db.vlogGenerationRewriteProcessedLiveBytes.Add(uint64(processedLedgerLiveBytes)) + } + if processedLedgerStaleBytes > 0 { + db.vlogGenerationRewriteProcessedStaleBytes.Add(uint64(processedLedgerStaleBytes)) + } + } if effectiveBytesBefore > 0 && effectiveBytesAfter >= effectiveBytesBefore && !locallyEffectiveProcessedDebt { db.vlogGenerationRewriteIneffectiveRuns.Add(1) 
db.vlogGenerationRewriteIneffectiveBytesIn.Add(uint64(effectiveBytesBefore)) @@ -13994,6 +14006,12 @@ planned: } } if locallyEffectiveProcessedDebt { + if effectiveBytesAfter >= effectiveBytesBefore { + db.vlogGenerationRewriteNoReclaimRuns.Add(1) + if processedLedgerStaleBytes > 0 { + db.vlogGenerationRewriteNoReclaimStaleBytes.Add(uint64(processedLedgerStaleBytes)) + } + } db.debugVlogMaintf( "rewrite_effective_local reason=%s processed_ids=%d planned_total=%d planned_live=%d planned_stale=%d global_bytes_before=%d global_bytes_after=%d gc_bytes_deleted=%d records=%d", vlogGenerationReasonString(reason), @@ -19802,6 +19820,10 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.segments.cold"] = fmt.Sprintf("%d", retained.SegmentsCold) stats["treedb.cache.vlog_generation.rewrite.bytes_in"] = fmt.Sprintf("%d", db.vlogGenerationRewriteBytesIn.Load()) stats["treedb.cache.vlog_generation.rewrite.bytes_out"] = fmt.Sprintf("%d", db.vlogGenerationRewriteBytesOut.Load()) + stats["treedb.cache.vlog_generation.rewrite.processed_live_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteProcessedLiveBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.processed_stale_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteProcessedStaleBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.no_reclaim_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteNoReclaimRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.no_reclaim_stale_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteNoReclaimStaleBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_canceled"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanCanceled.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go 
b/TreeDB/caching/vlog_generation_scheduler_test.go index 99f715ef1..230f09573 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -5717,6 +5717,10 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationMaintenanceSkipStageDue.Store(2) db.vlogGenerationRewritePlanSelectedSegments.Store(6) db.vlogGenerationRewriteExecSourceSegments.Store(3) + db.vlogGenerationRewriteProcessedLiveBytes.Store(900) + db.vlogGenerationRewriteProcessedStaleBytes.Store(450) + db.vlogGenerationRewriteNoReclaimRuns.Store(3) + db.vlogGenerationRewriteNoReclaimStaleBytes.Store(320) db.vlogGenerationRewriteQueueMu.Lock() db.vlogGenerationRewriteQueueLoaded = true @@ -5817,4 +5821,16 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"]; got != "3" { t.Fatalf("rewrite exec source segments total=%q want 3", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.processed_live_bytes"]; got != "900" { + t.Fatalf("rewrite processed live bytes=%q want 900", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.processed_stale_bytes"]; got != "450" { + t.Fatalf("rewrite processed stale bytes=%q want 450", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.no_reclaim_runs"]; got != "3" { + t.Fatalf("rewrite no reclaim runs=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.no_reclaim_stale_bytes"]; got != "320" { + t.Fatalf("rewrite no reclaim stale bytes=%q want 320", got) + } } diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index 6e490ca7a..9068195d2 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -292,3 +292,49 @@ - interpretation: - stage-confirm policy change increased in-run rewritten source segments (`1 -> 2`) in a comparable 5000-height window. 
- immediate live reclaim remains `0`, and offline compaction still cuts ~2.6 GiB, so major headroom remains. + +- No-reclaim diagnostics instrumentation for live rewrite: + - code changes: + - `TreeDB/caching/db.go` + - added rewrite economics counters: + - `treedb.cache.vlog_generation.rewrite.processed_live_bytes` + - `treedb.cache.vlog_generation.rewrite.processed_stale_bytes` + - `treedb.cache.vlog_generation.rewrite.no_reclaim_runs` + - `treedb.cache.vlog_generation.rewrite.no_reclaim_stale_bytes` + - counters update in rewrite execution path: + - accumulate processed live/stale bytes from processed ledger chunk + - mark `no_reclaim_runs` when rewrite copied stale debt but global bytes did not fall in-pass + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - extended `TestVlogGenerationStats_ReportRewriteBacklogAndDurations` assertions for new keys. + +- Validation: + - `go test ./TreeDB/caching -run 'TestVlogGenerationStats_ReportRewriteBacklogAndDurations|TestVlogGenerationRewritePlan_StageConfirmationDebtDrainProcessesMultipleSegments|TestVlogGenerationRewriteQueue_CheckpointKickDebtDrainCapsSingleSegment' -count=1` + - `go test ./TreeDB/caching -count=1` + - `go test ./TreeDB -count=1` + +- `run_celestia` readout with new counters: + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast FREEZE_REMOTE_HEIGHT_AT_START=1 STOP_AT_LOCAL_HEIGHT=5000 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327201801` + - final snapshot: + - `pprof-heap-max-rss-final-7767828k-20260327202312.treedb_vars.json` + - key counters: + - `plan_selected_segments_total=3` + - `rewrite.exec.source_segments_total=2` + - `rewrite.runs=1` + - `rewrite.bytes_in=38292854` + - `rewrite.processed_live_bytes=38292854` + - `rewrite.processed_stale_bytes=498581006` + - `rewrite.no_reclaim_runs=1` + - `rewrite.no_reclaim_stale_bytes=498581006` + - `rewrite.reclaimed_bytes=0` + - `gc.deleted_bytes=0` + 
+- interpretation: + - live rewrite now clearly reports that substantial stale payload was processed in-pass (~498 MiB) with zero immediate reclaim, confirming reclaim is blocked/deferred downstream of selection+copy. + +- offline rewrite sanity check for same run: + - pre: `4747763395` + - post: `2064528109` + - tool output: `segments_before=20 segments_after=16 bytes_before=4674175679 bytes_after=2026328485 records=963752` From b8d918682545e3c5b0102b6385f6a1392ae71854 Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 20:50:28 -1000 Subject: [PATCH 10/61] treedb: export gc blocker classification stats --- TreeDB/caching/db.go | 44 +++++++++++++++++ .../caching/vlog_generation_scheduler_test.go | 48 +++++++++++++++++++ TreeDB/db/vlog_gc.go | 8 ++++ worklog/2026-03-27.md | 27 +++++++++++ 4 files changed, 127 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 1baf6be7d..f26ae16eb 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5216,6 +5216,18 @@ type DB struct { vlogGenerationLastGCDryRunUnixNano atomic.Int64 vlogGenerationLastGCDryRunBytesEligible atomic.Int64 vlogGenerationLastGCDryRunSegsEligible atomic.Int64 + vlogGenerationLastGCBytesReferenced atomic.Int64 + vlogGenerationLastGCSegmentsReferenced atomic.Int64 + vlogGenerationLastGCBytesActive atomic.Int64 + vlogGenerationLastGCSegmentsActive atomic.Int64 + vlogGenerationLastGCBytesProtected atomic.Int64 + vlogGenerationLastGCSegmentsProtected atomic.Int64 + vlogGenerationLastGCBytesEligible atomic.Int64 + vlogGenerationLastGCSegmentsEligible atomic.Int64 + vlogGenerationLastGCBytesDeleted atomic.Int64 + vlogGenerationLastGCSegmentsDeleted atomic.Int64 + vlogGenerationLastGCBytesPending atomic.Int64 + vlogGenerationLastGCSegmentsPending atomic.Int64 vlogGenerationChurnBytes atomic.Uint64 vlogGenerationSchedulerState atomic.Uint32 vlogGenerationMaintenanceActive atomic.Bool @@ -12497,6 +12509,24 @@ func (db *DB) observeVlogGenerationGCExecDuration(d 
time.Duration) { observeDurationNanos(&db.vlogGenerationGCExecTotalNanos, &db.vlogGenerationGCExecMaxNanos, d) } +func (db *DB) observeVlogGenerationGCStats(stats backenddb.ValueLogGCStats) { + if db == nil { + return + } + db.vlogGenerationLastGCBytesReferenced.Store(stats.BytesReferenced) + db.vlogGenerationLastGCSegmentsReferenced.Store(int64(stats.SegmentsReferenced)) + db.vlogGenerationLastGCBytesActive.Store(stats.BytesActive) + db.vlogGenerationLastGCSegmentsActive.Store(int64(stats.SegmentsActive)) + db.vlogGenerationLastGCBytesProtected.Store(stats.BytesProtected) + db.vlogGenerationLastGCSegmentsProtected.Store(int64(stats.SegmentsProtected)) + db.vlogGenerationLastGCBytesEligible.Store(stats.BytesEligible) + db.vlogGenerationLastGCSegmentsEligible.Store(int64(stats.SegmentsEligible)) + db.vlogGenerationLastGCBytesDeleted.Store(stats.BytesDeleted) + db.vlogGenerationLastGCSegmentsDeleted.Store(int64(stats.SegmentsDeleted)) + db.vlogGenerationLastGCBytesPending.Store(stats.BytesPending) + db.vlogGenerationLastGCSegmentsPending.Store(int64(stats.SegmentsPending)) +} + func (db *DB) observeVlogGenerationVacuumExecDuration(d time.Duration) { if db == nil { return @@ -13955,6 +13985,7 @@ planned: db.debugVlogMaintf("gc_after_rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), gcErr, float64(gcDur.Microseconds())/1000) return fmt.Errorf("generational gc after rewrite: %w", gcErr) } + db.observeVlogGenerationGCStats(gcStats) if gcStats.BytesDeleted > 0 { gcBytesDeleted = int64(gcStats.BytesDeleted) effectiveBytesAfter -= gcBytesDeleted @@ -14163,6 +14194,7 @@ planned: if err != nil { return fmt.Errorf("generational gc: %w", err) } + db.observeVlogGenerationGCStats(gcStats) db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) db.vlogGenerationGCRuns.Add(1) if gcStats.SegmentsDeleted > 0 { @@ -19864,6 +19896,18 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.last_unix_nano"] = 
fmt.Sprintf("%d", db.vlogGenerationLastRewriteUnixNano.Load()) stats["treedb.cache.vlog_generation.gc.deleted_segments"] = fmt.Sprintf("%d", db.vlogGenerationGCSegmentsDeleted.Load()) stats["treedb.cache.vlog_generation.gc.deleted_bytes"] = fmt.Sprintf("%d", db.vlogGenerationGCBytesDeleted.Load()) + stats["treedb.cache.vlog_generation.gc.last_referenced_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsReferenced.Load()) + stats["treedb.cache.vlog_generation.gc.last_referenced_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesReferenced.Load()) + stats["treedb.cache.vlog_generation.gc.last_active_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsActive.Load()) + stats["treedb.cache.vlog_generation.gc.last_active_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesActive.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtected.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesProtected.Load()) + stats["treedb.cache.vlog_generation.gc.last_eligible_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsEligible.Load()) + stats["treedb.cache.vlog_generation.gc.last_eligible_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesEligible.Load()) + stats["treedb.cache.vlog_generation.gc.last_deleted_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsDeleted.Load()) + stats["treedb.cache.vlog_generation.gc.last_deleted_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesDeleted.Load()) + stats["treedb.cache.vlog_generation.gc.last_pending_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsPending.Load()) + stats["treedb.cache.vlog_generation.gc.last_pending_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesPending.Load()) stats["treedb.cache.vlog_generation.gc.runs"] = fmt.Sprintf("%d", db.vlogGenerationGCRuns.Load()) 
stats["treedb.cache.vlog_generation.gc.exec.total_ms"] = fmt.Sprintf("%.3f", float64(gcExecTotalNS)/float64(time.Millisecond)) stats["treedb.cache.vlog_generation.gc.exec.max_ms"] = fmt.Sprintf("%.3f", float64(gcExecMaxNS)/float64(time.Millisecond)) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 230f09573..7c75f7fd2 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -5713,6 +5713,18 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationRewriteBudgetTokensBytes.Store(512) db.vlogGenerationRewriteBudgetConsumed.Store(1536) db.vlogGenerationRewriteAgeBlockedUntilNS.Store(time.Now().Add(5 * time.Second).UnixNano()) + db.vlogGenerationLastGCSegmentsReferenced.Store(7) + db.vlogGenerationLastGCBytesReferenced.Store(700) + db.vlogGenerationLastGCSegmentsActive.Store(4) + db.vlogGenerationLastGCBytesActive.Store(400) + db.vlogGenerationLastGCSegmentsProtected.Store(3) + db.vlogGenerationLastGCBytesProtected.Store(300) + db.vlogGenerationLastGCSegmentsEligible.Store(6) + db.vlogGenerationLastGCBytesEligible.Store(600) + db.vlogGenerationLastGCSegmentsDeleted.Store(2) + db.vlogGenerationLastGCBytesDeleted.Store(200) + db.vlogGenerationLastGCSegmentsPending.Store(4) + db.vlogGenerationLastGCBytesPending.Store(400) db.vlogGenerationMaintenanceSkipStageNotDue.Store(5) db.vlogGenerationMaintenanceSkipStageDue.Store(2) db.vlogGenerationRewritePlanSelectedSegments.Store(6) @@ -5764,6 +5776,42 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.gc.exec.avg_ms"]; got != "30.000" { t.Fatalf("gc exec avg ms=%q want 30.000", got) } + if got := stats["treedb.cache.vlog_generation.gc.last_referenced_segments"]; got != "7" { + t.Fatalf("gc last referenced segments=%q want 7", got) + } + if got := 
stats["treedb.cache.vlog_generation.gc.last_referenced_bytes"]; got != "700" { + t.Fatalf("gc last referenced bytes=%q want 700", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_active_segments"]; got != "4" { + t.Fatalf("gc last active segments=%q want 4", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_active_bytes"]; got != "400" { + t.Fatalf("gc last active bytes=%q want 400", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_segments"]; got != "3" { + t.Fatalf("gc last protected segments=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_bytes"]; got != "300" { + t.Fatalf("gc last protected bytes=%q want 300", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_eligible_segments"]; got != "6" { + t.Fatalf("gc last eligible segments=%q want 6", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_eligible_bytes"]; got != "600" { + t.Fatalf("gc last eligible bytes=%q want 600", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_deleted_segments"]; got != "2" { + t.Fatalf("gc last deleted segments=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_deleted_bytes"]; got != "200" { + t.Fatalf("gc last deleted bytes=%q want 200", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_pending_segments"]; got != "4" { + t.Fatalf("gc last pending segments=%q want 4", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_pending_bytes"]; got != "400" { + t.Fatalf("gc last pending bytes=%q want 400", got) + } if got := stats["treedb.cache.vlog_generation.vacuum.exec.total_ms"]; got != "44.000" { t.Fatalf("vacuum exec total ms=%q want 44.000", got) } diff --git a/TreeDB/db/vlog_gc.go b/TreeDB/db/vlog_gc.go index d67d0aaf1..19ac1d43a 100644 --- a/TreeDB/db/vlog_gc.go +++ b/TreeDB/db/vlog_gc.go @@ -29,12 +29,14 @@ type ValueLogGCStats struct { SegmentsProtected int SegmentsEligible int 
SegmentsDeleted int + SegmentsPending int BytesTotal int64 BytesReferenced int64 BytesActive int64 BytesProtected int64 BytesEligible int64 BytesDeleted int64 + BytesPending int64 } // ValueLogGC deletes fully-unreferenced value-log segments. @@ -164,6 +166,12 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG } } } + if stats.SegmentsEligible > stats.SegmentsDeleted { + stats.SegmentsPending = stats.SegmentsEligible - stats.SegmentsDeleted + } + if stats.BytesEligible > stats.BytesDeleted { + stats.BytesPending = stats.BytesEligible - stats.BytesDeleted + } currentSet := vm.CurrentSetNoRefresh() if currentSet != nil { diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index 9068195d2..9b357c3b1 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -338,3 +338,30 @@ - pre: `4747763395` - post: `2064528109` - tool output: `segments_before=20 segments_after=16 bytes_before=4674175679 bytes_after=2026328485 records=963752` + +- GC blocker classification instrumentation (follow-up to no-reclaim counters): + - goal: + - make no-reclaim episodes diagnosable in one snapshot by showing whether bytes are blocked by active/pinned/protected classes vs actually eligible but pending delete. + - code changes: + - `TreeDB/db/vlog_gc.go` + - extended `ValueLogGCStats` with: + - `SegmentsPending` + - `BytesPending` + - populated pending values after delete attempts as `eligible - deleted` when positive. + - `TreeDB/caching/db.go` + - added cached per-run GC classification fields to `DB` atomics. 
+ - added `observeVlogGenerationGCStats(...)` and wired it into both: + - post-rewrite GC pass + - periodic GC pass + - exported new stats keys: + - `treedb.cache.vlog_generation.gc.last_referenced_segments/bytes` + - `treedb.cache.vlog_generation.gc.last_active_segments/bytes` + - `treedb.cache.vlog_generation.gc.last_protected_segments/bytes` + - `treedb.cache.vlog_generation.gc.last_eligible_segments/bytes` + - `treedb.cache.vlog_generation.gc.last_deleted_segments/bytes` + - `treedb.cache.vlog_generation.gc.last_pending_segments/bytes` + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - extended `TestVlogGenerationStats_ReportRewriteBacklogAndDurations` with assertions for all new keys. + +- Validation: + - `go test ./TreeDB/db ./TreeDB/caching -count=1` From 29ec3716034258431acbd6b991888587e6215f4f Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 21:01:09 -1000 Subject: [PATCH 11/61] worklog: record gc blocker readout from run_celestia --- worklog/2026-03-27.md | 53 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index 9b357c3b1..b9725f86f 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -365,3 +365,56 @@ - Validation: - `go test ./TreeDB/db ./TreeDB/caching -count=1` + +- Corrected `run_celestia` validation after adding `gc.last_*` stats: + - initial rerun with `STOP_AT_LOCAL_HEIGHT=5000` was invalid for maintenance analysis because the script treats it as an absolute local-height target; after state sync jump to ~10.4M, it exited immediately with no rewrite activity. 
+ - corrected run command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327205149` + - note on diagnostics source: + - per-instance application counters were read from `*.debug_vars.json` at: + - `.treedb.instances[".../data/application.db/maindb/wal#..."]` + - `*.treedb_application_vars.json` was `{}` in this run, so instance readout is the reliable source. + +- Final application-instance counters (`pprof-heap-max-rss-final-11027988k-20260327205709.debug_vars.json`): + - rewrite: + - `rewrite.plan_selected_segments_total=3` + - `rewrite.exec.source_segments_total=2` + - `rewrite.runs=1` + - `rewrite.bytes_in=38292854` + - `rewrite.processed_live_bytes=38292854` + - `rewrite.processed_stale_bytes=498582443` + - `rewrite.no_reclaim_runs=1` + - `rewrite.no_reclaim_stale_bytes=498582443` + - `rewrite.reclaimed_bytes=0` + - gc/classification: + - `gc.runs=1` + - `gc.deleted_bytes=0` + - `gc.last_referenced_segments=8` + - `gc.last_referenced_bytes=1294769679` + - `gc.last_active_segments=0` + - `gc.last_active_bytes=0` + - `gc.last_protected_segments=2` + - `gc.last_protected_bytes=536875297` + - `gc.last_eligible_segments=0` + - `gc.last_eligible_bytes=0` + - `gc.last_deleted_segments=0` + - `gc.last_deleted_bytes=0` + - `gc.last_pending_segments=0` + - `gc.last_pending_bytes=0` + - maintenance: + - `maintenance.attempts=35` + - `maintenance.acquired=35` + - `maintenance.collisions=0` + +- Interpretation: + - this run confirms stale bytes are being copied by live rewrite, but immediate reclaim is blocked because the final GC view reports `eligible=0` (not delete failure/pending). + - blocker class in this sample is dominated by `referenced + protected` bytes, not active segment pinning and not eligible-but-pending deletion. 
+ +- Offline reclaim headroom on the same run home: + - command: + - `/home/mikers/dev/snissn/celestia-app-p4/build/treemap-local vlog-rewrite /home/mikers/.celestia-app-mainnet-treedb-20260327205149/data/application.db -rw` + - pre: `5035136550` + - tool output: `segments_before=21 segments_after=16 bytes_before=4888181282 bytes_after=2076143228 records=985926` + - post: `2115096516` From aebe8037c3c7984f97a1d94f1aea73d02f630d02 Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 21:20:11 -1000 Subject: [PATCH 12/61] treedb: split gc protected blockers by class --- TreeDB/caching/db.go | 351 ++++++++++-------- .../caching/vlog_generation_scheduler_test.go | 32 ++ TreeDB/db/vlog_gc.go | 103 ++++- TreeDB/db/vlog_gc_test.go | 71 ++++ worklog/2026-03-27.md | 72 ++++ 5 files changed, 449 insertions(+), 180 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index f26ae16eb..e5ed17eba 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -3864,38 +3864,44 @@ func (db *DB) ValueLogRetainedPaths() []string { return db.valueLogRetainedPaths() } -func (db *DB) valueLogProtectedPaths() []string { - retained := db.valueLogRetainedPaths() - inUse := db.valueLogInUsePaths() - if len(retained) == 0 { - return inUse - } - if len(inUse) == 0 { - return retained - } - seen := make(map[string]struct{}, len(retained)+len(inUse)) - paths := make([]string, 0, len(retained)+len(inUse)) - for _, path := range retained { - if path == "" { - continue - } - if _, ok := seen[path]; ok { - continue +func mergeUniqueNonEmptyStrings(pathSets ...[]string) []string { + seen := make(map[string]struct{}) + var out []string + for _, paths := range pathSets { + for _, path := range paths { + if path == "" { + continue + } + if _, ok := seen[path]; ok { + continue + } + seen[path] = struct{}{} + out = append(out, path) } - seen[path] = struct{}{} - paths = append(paths, path) } - for _, path := range inUse { - if path == "" { - continue - } - if _, ok := 
seen[path]; ok { - continue - } - seen[path] = struct{}{} - paths = append(paths, path) + return out +} + +func (db *DB) valueLogGCProtectedPathSets() (retained []string, inUse []string, merged []string) { + retained = db.valueLogRetainedPaths() + inUse = db.valueLogInUsePaths() + merged = mergeUniqueNonEmptyStrings(retained, inUse) + return retained, inUse, merged +} + +func (db *DB) valueLogProtectedPaths() []string { + _, _, merged := db.valueLogGCProtectedPathSets() + return merged +} + +func (db *DB) valueLogGCOptions(dryRun bool) backenddb.ValueLogGCOptions { + retained, inUse, merged := db.valueLogGCProtectedPathSets() + return backenddb.ValueLogGCOptions{ + DryRun: dryRun, + ProtectedPaths: merged, + ProtectedInUsePaths: inUse, + ProtectedRetainedPaths: retained, } - return paths } // valueLogInUsePaths returns a best-effort snapshot of value-log segment paths @@ -5142,132 +5148,140 @@ type DB struct { valueLogMaxSegmentBytes int64 journalCompression bool - disableJournal bool - relaxedSync bool - notifyError func(error) - debugFlushPointers bool - debugFlushTiming bool - debugPtrEligible atomic.Int64 - debugPtrUsed atomic.Int64 - debugPtrNoPtr atomic.Int64 - debugPtrDenied atomic.Int64 - debugPtrDisabled atomic.Int64 - checkpointRuns atomic.Uint64 - checkpointTotalNs atomic.Uint64 - checkpointMaxNs atomic.Uint64 - checkpointNoopSkips atomic.Uint64 - checkpointFlushMuWaitNs atomic.Uint64 - checkpointFlushMuWaitMaxNs atomic.Uint64 - checkpointAutoVacuumRuns atomic.Uint64 - checkpointAutoVacuumLastCheckRun atomic.Uint64 - checkpointAutoVacuumLastPages atomic.Uint64 - checkpointAutoVacuumLastInternalP50 atomic.Uint64 - checkpointAutoVacuumLastInternalAvg atomic.Uint64 - lastForegroundWriteUnixNano atomic.Int64 - lastForegroundReadUnixNano atomic.Int64 - foregroundReadStampCounter atomic.Uint32 - activeForegroundIterators atomic.Int64 - retainedPruneLastStartUnixNano atomic.Int64 - retainedPruneMu sync.Mutex - retainedPruneDone chan struct{} - 
vlogGenerationRemapSuccesses atomic.Uint64 - vlogGenerationRemapFailures atomic.Uint64 - vlogGenerationRewriteBytesIn atomic.Uint64 - vlogGenerationRewriteBytesOut atomic.Uint64 - vlogGenerationRewriteReclaimedBytes atomic.Uint64 - vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 - vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 - vlogGenerationRewriteNoReclaimRuns atomic.Uint64 - vlogGenerationRewriteNoReclaimStaleBytes atomic.Uint64 - vlogGenerationRewriteRuns atomic.Uint64 - vlogGenerationRewritePlanRuns atomic.Uint64 - vlogGenerationRewritePlanCanceled atomic.Uint64 - vlogGenerationRewritePlanErrors atomic.Uint64 - vlogGenerationRewritePlanEmpty atomic.Uint64 - vlogGenerationRewritePlanSelected atomic.Uint64 - vlogGenerationRewritePlanSelectedSegments atomic.Uint64 - vlogGenerationRewritePlanSelectedBytes atomic.Uint64 - vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 - vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 - vlogGenerationRewritePlanCanceledLastNS atomic.Int64 - vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 - vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool - vlogGenerationRewriteIneffectiveLastNS atomic.Int64 - vlogGenerationRewriteIneffectiveRuns atomic.Uint64 - vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 - vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 - vlogGenerationRewriteCanceledRuns atomic.Uint64 - vlogGenerationRewriteCanceledLastNS atomic.Int64 - vlogGenerationRewriteQueuePruneRuns atomic.Uint64 - vlogGenerationRewriteQueuePruneIDs atomic.Uint64 - vlogGenerationGCSegmentsDeleted atomic.Uint64 - vlogGenerationGCBytesDeleted atomic.Uint64 - vlogGenerationGCRuns atomic.Uint64 - vlogGenerationVacuumRuns atomic.Uint64 - vlogGenerationVacuumFailures atomic.Uint64 - vlogGenerationVacuumSkippedDisabled atomic.Uint64 - vlogGenerationVacuumSkippedRewriteBytes atomic.Uint64 - vlogGenerationVacuumSkippedCooldown atomic.Uint64 - vlogGenerationLastVacuumUnixNano atomic.Int64 - 
vlogGenerationLastRewritePlanUnixNano atomic.Int64 - vlogGenerationLastRewriteUnixNano atomic.Int64 - vlogGenerationLastGCUnixNano atomic.Int64 - vlogGenerationLastCheckpointKickUnixNano atomic.Int64 - vlogGenerationLastGCDryRunUnixNano atomic.Int64 - vlogGenerationLastGCDryRunBytesEligible atomic.Int64 - vlogGenerationLastGCDryRunSegsEligible atomic.Int64 - vlogGenerationLastGCBytesReferenced atomic.Int64 - vlogGenerationLastGCSegmentsReferenced atomic.Int64 - vlogGenerationLastGCBytesActive atomic.Int64 - vlogGenerationLastGCSegmentsActive atomic.Int64 - vlogGenerationLastGCBytesProtected atomic.Int64 - vlogGenerationLastGCSegmentsProtected atomic.Int64 - vlogGenerationLastGCBytesEligible atomic.Int64 - vlogGenerationLastGCSegmentsEligible atomic.Int64 - vlogGenerationLastGCBytesDeleted atomic.Int64 - vlogGenerationLastGCSegmentsDeleted atomic.Int64 - vlogGenerationLastGCBytesPending atomic.Int64 - vlogGenerationLastGCSegmentsPending atomic.Int64 - vlogGenerationChurnBytes atomic.Uint64 - vlogGenerationSchedulerState atomic.Uint32 - vlogGenerationMaintenanceActive atomic.Bool - vlogGenerationMaintenanceAttempts atomic.Uint64 - vlogGenerationMaintenanceAcquired atomic.Uint64 - vlogGenerationMaintenanceCollisions atomic.Uint64 - vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 - vlogGenerationMaintenanceSkipPhase atomic.Uint64 - vlogGenerationMaintenanceSkipStageGate atomic.Uint64 - vlogGenerationMaintenanceSkipStageNotDue atomic.Uint64 - vlogGenerationMaintenanceSkipStageDue atomic.Uint64 - vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 - vlogGenerationMaintenanceSkipPriority atomic.Uint64 - vlogGenerationMaintenanceSkipQuiet atomic.Uint64 - vlogGenerationMaintenanceSkipPreCheckpoint atomic.Uint64 - vlogGenerationMaintenanceSkipCheckpointing atomic.Uint64 - vlogGenerationMaintenancePassNoop atomic.Uint64 - vlogGenerationMaintenancePassWithRewrite atomic.Uint64 - vlogGenerationMaintenancePassWithGC atomic.Uint64 - 
vlogGenerationMaintenancePassTotalNanos atomic.Uint64 - vlogGenerationMaintenancePassMaxNanos atomic.Uint64 - vlogGenerationLastReason atomic.Uint32 - vlogGenerationCheckpointKickRuns atomic.Uint64 - vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 - vlogGenerationCheckpointKickGCRuns atomic.Uint64 - vlogGenerationCheckpointKickPending atomic.Bool - vlogGenerationDeferredMaintenancePending atomic.Bool - vlogGenerationDeferredMaintenanceRunning atomic.Bool - vlogGenerationRewriteStageWakeObservedNS atomic.Int64 - vlogGenerationRewriteQueueMu sync.Mutex - vlogGenerationCheckpointKickActive atomic.Bool - vlogGenerationRewriteQueue []uint32 - vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment - vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty - vlogGenerationRewriteStagePending bool - vlogGenerationRewriteStageObservedUnixNano int64 - vlogGenerationRewriteQueueLoaded bool - vlogGenerationLastChurnBps atomic.Int64 - vlogGenerationLastChurnSampleBytes atomic.Uint64 - vlogGenerationLastChurnSampleNS atomic.Int64 + disableJournal bool + relaxedSync bool + notifyError func(error) + debugFlushPointers bool + debugFlushTiming bool + debugPtrEligible atomic.Int64 + debugPtrUsed atomic.Int64 + debugPtrNoPtr atomic.Int64 + debugPtrDenied atomic.Int64 + debugPtrDisabled atomic.Int64 + checkpointRuns atomic.Uint64 + checkpointTotalNs atomic.Uint64 + checkpointMaxNs atomic.Uint64 + checkpointNoopSkips atomic.Uint64 + checkpointFlushMuWaitNs atomic.Uint64 + checkpointFlushMuWaitMaxNs atomic.Uint64 + checkpointAutoVacuumRuns atomic.Uint64 + checkpointAutoVacuumLastCheckRun atomic.Uint64 + checkpointAutoVacuumLastPages atomic.Uint64 + checkpointAutoVacuumLastInternalP50 atomic.Uint64 + checkpointAutoVacuumLastInternalAvg atomic.Uint64 + lastForegroundWriteUnixNano atomic.Int64 + lastForegroundReadUnixNano atomic.Int64 + foregroundReadStampCounter atomic.Uint32 + activeForegroundIterators atomic.Int64 + retainedPruneLastStartUnixNano 
atomic.Int64 + retainedPruneMu sync.Mutex + retainedPruneDone chan struct{} + vlogGenerationRemapSuccesses atomic.Uint64 + vlogGenerationRemapFailures atomic.Uint64 + vlogGenerationRewriteBytesIn atomic.Uint64 + vlogGenerationRewriteBytesOut atomic.Uint64 + vlogGenerationRewriteReclaimedBytes atomic.Uint64 + vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 + vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 + vlogGenerationRewriteNoReclaimRuns atomic.Uint64 + vlogGenerationRewriteNoReclaimStaleBytes atomic.Uint64 + vlogGenerationRewriteRuns atomic.Uint64 + vlogGenerationRewritePlanRuns atomic.Uint64 + vlogGenerationRewritePlanCanceled atomic.Uint64 + vlogGenerationRewritePlanErrors atomic.Uint64 + vlogGenerationRewritePlanEmpty atomic.Uint64 + vlogGenerationRewritePlanSelected atomic.Uint64 + vlogGenerationRewritePlanSelectedSegments atomic.Uint64 + vlogGenerationRewritePlanSelectedBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 + vlogGenerationRewritePlanCanceledLastNS atomic.Int64 + vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 + vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool + vlogGenerationRewriteIneffectiveLastNS atomic.Int64 + vlogGenerationRewriteIneffectiveRuns atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 + vlogGenerationRewriteCanceledRuns atomic.Uint64 + vlogGenerationRewriteCanceledLastNS atomic.Int64 + vlogGenerationRewriteQueuePruneRuns atomic.Uint64 + vlogGenerationRewriteQueuePruneIDs atomic.Uint64 + vlogGenerationGCSegmentsDeleted atomic.Uint64 + vlogGenerationGCBytesDeleted atomic.Uint64 + vlogGenerationGCRuns atomic.Uint64 + vlogGenerationVacuumRuns atomic.Uint64 + vlogGenerationVacuumFailures atomic.Uint64 + vlogGenerationVacuumSkippedDisabled atomic.Uint64 + vlogGenerationVacuumSkippedRewriteBytes atomic.Uint64 + vlogGenerationVacuumSkippedCooldown 
atomic.Uint64 + vlogGenerationLastVacuumUnixNano atomic.Int64 + vlogGenerationLastRewritePlanUnixNano atomic.Int64 + vlogGenerationLastRewriteUnixNano atomic.Int64 + vlogGenerationLastGCUnixNano atomic.Int64 + vlogGenerationLastCheckpointKickUnixNano atomic.Int64 + vlogGenerationLastGCDryRunUnixNano atomic.Int64 + vlogGenerationLastGCDryRunBytesEligible atomic.Int64 + vlogGenerationLastGCDryRunSegsEligible atomic.Int64 + vlogGenerationLastGCBytesReferenced atomic.Int64 + vlogGenerationLastGCSegmentsReferenced atomic.Int64 + vlogGenerationLastGCBytesActive atomic.Int64 + vlogGenerationLastGCSegmentsActive atomic.Int64 + vlogGenerationLastGCBytesProtected atomic.Int64 + vlogGenerationLastGCSegmentsProtected atomic.Int64 + vlogGenerationLastGCBytesProtectedInUse atomic.Int64 + vlogGenerationLastGCSegmentsProtectedInUse atomic.Int64 + vlogGenerationLastGCBytesProtectedRetained atomic.Int64 + vlogGenerationLastGCSegmentsProtectedRetained atomic.Int64 + vlogGenerationLastGCBytesProtectedOverlap atomic.Int64 + vlogGenerationLastGCSegmentsProtectedOverlap atomic.Int64 + vlogGenerationLastGCBytesProtectedOther atomic.Int64 + vlogGenerationLastGCSegmentsProtectedOther atomic.Int64 + vlogGenerationLastGCBytesEligible atomic.Int64 + vlogGenerationLastGCSegmentsEligible atomic.Int64 + vlogGenerationLastGCBytesDeleted atomic.Int64 + vlogGenerationLastGCSegmentsDeleted atomic.Int64 + vlogGenerationLastGCBytesPending atomic.Int64 + vlogGenerationLastGCSegmentsPending atomic.Int64 + vlogGenerationChurnBytes atomic.Uint64 + vlogGenerationSchedulerState atomic.Uint32 + vlogGenerationMaintenanceActive atomic.Bool + vlogGenerationMaintenanceAttempts atomic.Uint64 + vlogGenerationMaintenanceAcquired atomic.Uint64 + vlogGenerationMaintenanceCollisions atomic.Uint64 + vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 + vlogGenerationMaintenanceSkipPhase atomic.Uint64 + vlogGenerationMaintenanceSkipStageGate atomic.Uint64 + vlogGenerationMaintenanceSkipStageNotDue atomic.Uint64 + 
vlogGenerationMaintenanceSkipStageDue atomic.Uint64 + vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 + vlogGenerationMaintenanceSkipPriority atomic.Uint64 + vlogGenerationMaintenanceSkipQuiet atomic.Uint64 + vlogGenerationMaintenanceSkipPreCheckpoint atomic.Uint64 + vlogGenerationMaintenanceSkipCheckpointing atomic.Uint64 + vlogGenerationMaintenancePassNoop atomic.Uint64 + vlogGenerationMaintenancePassWithRewrite atomic.Uint64 + vlogGenerationMaintenancePassWithGC atomic.Uint64 + vlogGenerationMaintenancePassTotalNanos atomic.Uint64 + vlogGenerationMaintenancePassMaxNanos atomic.Uint64 + vlogGenerationLastReason atomic.Uint32 + vlogGenerationCheckpointKickRuns atomic.Uint64 + vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 + vlogGenerationCheckpointKickGCRuns atomic.Uint64 + vlogGenerationCheckpointKickPending atomic.Bool + vlogGenerationDeferredMaintenancePending atomic.Bool + vlogGenerationDeferredMaintenanceRunning atomic.Bool + vlogGenerationRewriteStageWakeObservedNS atomic.Int64 + vlogGenerationRewriteQueueMu sync.Mutex + vlogGenerationCheckpointKickActive atomic.Bool + vlogGenerationRewriteQueue []uint32 + vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment + vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty + vlogGenerationRewriteStagePending bool + vlogGenerationRewriteStageObservedUnixNano int64 + vlogGenerationRewriteQueueLoaded bool + vlogGenerationLastChurnBps atomic.Int64 + vlogGenerationLastChurnSampleBytes atomic.Uint64 + vlogGenerationLastChurnSampleNS atomic.Int64 // Rewrite budget token bucket (bytes) for online maintenance. This lets us // interpret ValueLogRewriteBudgetBytesPerSec as a true per-second bandwidth // budget while still running maintenance at coarse intervals. 
@@ -12519,6 +12533,14 @@ func (db *DB) observeVlogGenerationGCStats(stats backenddb.ValueLogGCStats) { db.vlogGenerationLastGCSegmentsActive.Store(int64(stats.SegmentsActive)) db.vlogGenerationLastGCBytesProtected.Store(stats.BytesProtected) db.vlogGenerationLastGCSegmentsProtected.Store(int64(stats.SegmentsProtected)) + db.vlogGenerationLastGCBytesProtectedInUse.Store(stats.BytesProtectedInUse) + db.vlogGenerationLastGCSegmentsProtectedInUse.Store(int64(stats.SegmentsProtectedInUse)) + db.vlogGenerationLastGCBytesProtectedRetained.Store(stats.BytesProtectedRetained) + db.vlogGenerationLastGCSegmentsProtectedRetained.Store(int64(stats.SegmentsProtectedRetained)) + db.vlogGenerationLastGCBytesProtectedOverlap.Store(stats.BytesProtectedOverlap) + db.vlogGenerationLastGCSegmentsProtectedOverlap.Store(int64(stats.SegmentsProtectedOverlap)) + db.vlogGenerationLastGCBytesProtectedOther.Store(stats.BytesProtectedOther) + db.vlogGenerationLastGCSegmentsProtectedOther.Store(int64(stats.SegmentsProtectedOther)) db.vlogGenerationLastGCBytesEligible.Store(stats.BytesEligible) db.vlogGenerationLastGCSegmentsEligible.Store(int64(stats.SegmentsEligible)) db.vlogGenerationLastGCBytesDeleted.Store(stats.BytesDeleted) @@ -13975,9 +13997,7 @@ planned: if gcer, ok := db.backend.(backendValueLogGCer); ok { gcCtx, gcCancel := context.WithTimeout(context.Background(), 30*time.Second) gcStart := time.Now() - gcStats, gcErr := gcer.ValueLogGC(gcCtx, backenddb.ValueLogGCOptions{ - ProtectedPaths: db.valueLogProtectedPaths(), - }) + gcStats, gcErr := gcer.ValueLogGC(gcCtx, db.valueLogGCOptions(false)) gcCancel() gcDur := time.Since(gcStart) db.observeVlogGenerationGCExecDuration(gcDur) @@ -14186,7 +14206,7 @@ planned: now := time.Now() db.vlogGenerationLastGCUnixNano.Store(now.UnixNano()) ctx, cancel := db.foregroundMaintenanceContext(30 * time.Second) - gcOpts := backenddb.ValueLogGCOptions{ProtectedPaths: db.valueLogProtectedPaths()} + gcOpts := db.valueLogGCOptions(false) gcStart := 
time.Now() gcStats, err := gcer.ValueLogGC(ctx, gcOpts) cancel() @@ -14452,10 +14472,7 @@ func (db *DB) estimateVlogGenerationGCEligible(gcer backendValueLogGCer) (backen } ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - stats, err := gcer.ValueLogGC(ctx, backenddb.ValueLogGCOptions{ - DryRun: true, - ProtectedPaths: db.valueLogProtectedPaths(), - }) + stats, err := gcer.ValueLogGC(ctx, db.valueLogGCOptions(true)) if err == nil { db.vlogGenerationLastGCDryRunUnixNano.Store(time.Now().UnixNano()) db.vlogGenerationLastGCDryRunBytesEligible.Store(stats.BytesEligible) @@ -19902,6 +19919,14 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.gc.last_active_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesActive.Load()) stats["treedb.cache.vlog_generation.gc.last_protected_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtected.Load()) stats["treedb.cache.vlog_generation.gc.last_protected_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesProtected.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_in_use_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtectedInUse.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_in_use_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesProtectedInUse.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_retained_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtectedRetained.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_retained_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesProtectedRetained.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_overlap_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtectedOverlap.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_overlap_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesProtectedOverlap.Load()) + 
stats["treedb.cache.vlog_generation.gc.last_protected_other_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtectedOther.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_other_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesProtectedOther.Load()) stats["treedb.cache.vlog_generation.gc.last_eligible_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsEligible.Load()) stats["treedb.cache.vlog_generation.gc.last_eligible_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesEligible.Load()) stats["treedb.cache.vlog_generation.gc.last_deleted_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsDeleted.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 7c75f7fd2..4bb6a8912 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -5719,6 +5719,14 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationLastGCBytesActive.Store(400) db.vlogGenerationLastGCSegmentsProtected.Store(3) db.vlogGenerationLastGCBytesProtected.Store(300) + db.vlogGenerationLastGCSegmentsProtectedInUse.Store(1) + db.vlogGenerationLastGCBytesProtectedInUse.Store(100) + db.vlogGenerationLastGCSegmentsProtectedRetained.Store(1) + db.vlogGenerationLastGCBytesProtectedRetained.Store(120) + db.vlogGenerationLastGCSegmentsProtectedOverlap.Store(1) + db.vlogGenerationLastGCBytesProtectedOverlap.Store(80) + db.vlogGenerationLastGCSegmentsProtectedOther.Store(0) + db.vlogGenerationLastGCBytesProtectedOther.Store(0) db.vlogGenerationLastGCSegmentsEligible.Store(6) db.vlogGenerationLastGCBytesEligible.Store(600) db.vlogGenerationLastGCSegmentsDeleted.Store(2) @@ -5794,6 +5802,30 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.gc.last_protected_bytes"]; got != "300" { t.Fatalf("gc last protected 
bytes=%q want 300", got) } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_in_use_segments"]; got != "1" { + t.Fatalf("gc last protected in use segments=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_in_use_bytes"]; got != "100" { + t.Fatalf("gc last protected in use bytes=%q want 100", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_retained_segments"]; got != "1" { + t.Fatalf("gc last protected retained segments=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_retained_bytes"]; got != "120" { + t.Fatalf("gc last protected retained bytes=%q want 120", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_overlap_segments"]; got != "1" { + t.Fatalf("gc last protected overlap segments=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_overlap_bytes"]; got != "80" { + t.Fatalf("gc last protected overlap bytes=%q want 80", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_other_segments"]; got != "0" { + t.Fatalf("gc last protected other segments=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_other_bytes"]; got != "0" { + t.Fatalf("gc last protected other bytes=%q want 0", got) + } if got := stats["treedb.cache.vlog_generation.gc.last_eligible_segments"]; got != "6" { t.Fatalf("gc last eligible segments=%q want 6", got) } diff --git a/TreeDB/db/vlog_gc.go b/TreeDB/db/vlog_gc.go index 19ac1d43a..cbdc96e49 100644 --- a/TreeDB/db/vlog_gc.go +++ b/TreeDB/db/vlog_gc.go @@ -17,26 +17,42 @@ const valueLogKeepRecentSegmentsPerLane = 2 // ValueLogGCOptions controls value-log garbage collection. type ValueLogGCOptions struct { - DryRun bool + DryRun bool + // ProtectedPaths preserves legacy callers that provide a single merged set + // of protected paths. 
Prefer the specific ProtectedInUsePaths and + // ProtectedRetainedPaths fields for blocker classification. ProtectedPaths []string + // ProtectedInUsePaths are paths that may still be referenced by mutable + // in-memory state during online maintenance. + ProtectedInUsePaths []string + // ProtectedRetainedPaths are paths pinned by pointer lifecycle retention. + ProtectedRetainedPaths []string } // ValueLogGCStats summarizes value-log GC work. type ValueLogGCStats struct { - SegmentsTotal int - SegmentsReferenced int - SegmentsActive int - SegmentsProtected int - SegmentsEligible int - SegmentsDeleted int - SegmentsPending int - BytesTotal int64 - BytesReferenced int64 - BytesActive int64 - BytesProtected int64 - BytesEligible int64 - BytesDeleted int64 - BytesPending int64 + SegmentsTotal int + SegmentsReferenced int + SegmentsActive int + SegmentsProtected int + SegmentsProtectedInUse int + SegmentsProtectedRetained int + SegmentsProtectedOverlap int + SegmentsProtectedOther int + SegmentsEligible int + SegmentsDeleted int + SegmentsPending int + BytesTotal int64 + BytesReferenced int64 + BytesActive int64 + BytesProtected int64 + BytesProtectedInUse int64 + BytesProtectedRetained int64 + BytesProtectedOverlap int64 + BytesProtectedOther int64 + BytesEligible int64 + BytesDeleted int64 + BytesPending int64 } // ValueLogGC deletes fully-unreferenced value-log segments. 
@@ -83,8 +99,9 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG set = vm.CurrentSetNoRefresh() } keptIDs := currentValueLogIDs(set) - if len(opts.ProtectedPaths) > 0 { - if recent := recentValueLogIDsForProtectedPaths(set, valueLogKeepRecentSegmentsPerLane, opts.ProtectedPaths); len(recent) > 0 { + protectedAll := mergeUniqueNonEmptyPaths(opts.ProtectedPaths, opts.ProtectedInUsePaths, opts.ProtectedRetainedPaths) + if len(protectedAll) > 0 { + if recent := recentValueLogIDsForProtectedPaths(set, valueLogKeepRecentSegmentsPerLane, protectedAll); len(recent) > 0 { keptIDs = recent } } @@ -95,6 +112,20 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG } protectedPaths[path] = struct{}{} } + protectedInUsePaths := make(map[string]struct{}, len(opts.ProtectedInUsePaths)) + for _, path := range opts.ProtectedInUsePaths { + if path == "" { + continue + } + protectedInUsePaths[path] = struct{}{} + } + protectedRetainedPaths := make(map[string]struct{}, len(opts.ProtectedRetainedPaths)) + for _, path := range opts.ProtectedRetainedPaths { + if path == "" { + continue + } + protectedRetainedPaths[path] = struct{}{} + } type candidate struct { path string size int64 @@ -119,9 +150,29 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG stats.BytesActive += size continue } + _, inUseProtected := protectedInUsePaths[f.Path] + _, retainedProtected := protectedRetainedPaths[f.Path] + if inUseProtected || retainedProtected { + stats.SegmentsProtected++ + stats.BytesProtected += size + switch { + case inUseProtected && retainedProtected: + stats.SegmentsProtectedOverlap++ + stats.BytesProtectedOverlap += size + case inUseProtected: + stats.SegmentsProtectedInUse++ + stats.BytesProtectedInUse += size + default: + stats.SegmentsProtectedRetained++ + stats.BytesProtectedRetained += size + } + continue + } if _, ok := protectedPaths[f.Path]; ok { stats.SegmentsProtected++ 
stats.BytesProtected += size + stats.SegmentsProtectedOther++ + stats.BytesProtectedOther += size continue } @@ -208,6 +259,24 @@ func currentValueLogIDs(set *valuelog.Set) map[uint32]struct{} { return active } +func mergeUniqueNonEmptyPaths(pathSets ...[]string) []string { + seen := make(map[string]struct{}) + var out []string + for _, paths := range pathSets { + for _, path := range paths { + if path == "" { + continue + } + if _, ok := seen[path]; ok { + continue + } + seen[path] = struct{}{} + out = append(out, path) + } + } + return out +} + func recentValueLogIDs(set *valuelog.Set, keepPerLane int) map[uint32]struct{} { if keepPerLane <= 1 { return currentValueLogIDs(set) diff --git a/TreeDB/db/vlog_gc_test.go b/TreeDB/db/vlog_gc_test.go index fbdee4dc0..771f2b116 100644 --- a/TreeDB/db/vlog_gc_test.go +++ b/TreeDB/db/vlog_gc_test.go @@ -176,6 +176,77 @@ func TestValueLogGC_ProtectedPathsDoNotKeepHistoricalRewriteLanes(t *testing.T) } } +func TestValueLogGC_ProtectedPathBreakdownStats(t *testing.T) { + dir := t.TempDir() + + db, err := Open(Options{Dir: dir}) + if err != nil { + t.Fatalf("open: %v", err) + } + defer func() { _ = db.Close() }() + + for seq := 1; seq <= 5; seq++ { + seq := seq + appendPointersInNewSegment(t, dir, 0, uint32(seq), uint64(seq)*1_000, 1, func(int) []byte { + return bytes.Repeat([]byte(fmt.Sprintf("lane0-seq%d|", seq)), 32) + }) + } + + if err := db.RefreshValueLogSet(); err != nil { + t.Fatalf("RefreshValueLogSet: %v", err) + } + + inUseOnlyPath := filepath.Join(dir, "wal", "value-l0-000001.log") + retainedOnlyPath := filepath.Join(dir, "wal", "value-l0-000002.log") + overlapPath := filepath.Join(dir, "wal", "value-l0-000003.log") + + stats, err := db.ValueLogGC(context.Background(), ValueLogGCOptions{ + DryRun: true, + ProtectedInUsePaths: []string{inUseOnlyPath, overlapPath}, + ProtectedRetainedPaths: []string{retainedOnlyPath, overlapPath}, + }) + if err != nil { + t.Fatalf("ValueLogGC: %v", err) + } + + if stats.SegmentsTotal 
!= 5 { + t.Fatalf("segments total=%d want 5", stats.SegmentsTotal) + } + if stats.SegmentsActive != 2 { + t.Fatalf("segments active=%d want 2", stats.SegmentsActive) + } + if stats.SegmentsProtected != 3 { + t.Fatalf("segments protected=%d want 3", stats.SegmentsProtected) + } + if stats.SegmentsProtectedInUse != 1 { + t.Fatalf("segments protected in-use=%d want 1", stats.SegmentsProtectedInUse) + } + if stats.SegmentsProtectedRetained != 1 { + t.Fatalf("segments protected retained=%d want 1", stats.SegmentsProtectedRetained) + } + if stats.SegmentsProtectedOverlap != 1 { + t.Fatalf("segments protected overlap=%d want 1", stats.SegmentsProtectedOverlap) + } + if stats.SegmentsProtectedOther != 0 { + t.Fatalf("segments protected other=%d want 0", stats.SegmentsProtectedOther) + } + if stats.SegmentsEligible != 0 { + t.Fatalf("segments eligible=%d want 0", stats.SegmentsEligible) + } + if stats.SegmentsDeleted != 0 { + t.Fatalf("segments deleted=%d want 0", stats.SegmentsDeleted) + } + if stats.BytesProtected <= 0 { + t.Fatalf("bytes protected=%d want >0", stats.BytesProtected) + } + if stats.BytesProtectedInUse <= 0 || stats.BytesProtectedRetained <= 0 || stats.BytesProtectedOverlap <= 0 { + t.Fatalf("expected non-zero protected byte buckets, got %+v", stats) + } + if stats.BytesProtectedOther != 0 { + t.Fatalf("bytes protected other=%d want 0", stats.BytesProtectedOther) + } +} + func TestValueLogGC_KeepsReferencedPointerSegments_WithOuterLeavesInValueLog(t *testing.T) { dir := t.TempDir() diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index b9725f86f..b44c770fb 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -418,3 +418,75 @@ - pre: `5035136550` - tool output: `segments_before=21 segments_after=16 bytes_before=4888181282 bytes_after=2076143228 records=985926` - post: `2115096516` + +- GC protected-class split instrumentation (retained vs in-use vs overlap): + - motivation: + - prior `gc.last_protected_{segments,bytes}` proved 
protection was the blocker class but did not identify whether protection came from in-memory in-use paths vs retained-path lifecycle pins. + - code changes: + - `TreeDB/db/vlog_gc.go` + - `ValueLogGCOptions` extended with: + - `ProtectedInUsePaths []string` + - `ProtectedRetainedPaths []string` + - `ValueLogGCStats` extended with protected split buckets: + - `SegmentsProtectedInUse`, `BytesProtectedInUse` + - `SegmentsProtectedRetained`, `BytesProtectedRetained` + - `SegmentsProtectedOverlap`, `BytesProtectedOverlap` + - `SegmentsProtectedOther`, `BytesProtectedOther` + - GC classification now tags protected candidates by class while preserving `SegmentsProtected/BytesProtected` totals. + - protected-lane recent-window keep logic now uses the union of legacy + split protected path lists. + - `TreeDB/caching/db.go` + - added helper `valueLogGCProtectedPathSets()` and `valueLogGCOptions(dryRun bool)` to pass split path sets into backend GC. + - `observeVlogGenerationGCStats` now records split protected classes. + - exported new stats keys: + - `treedb.cache.vlog_generation.gc.last_protected_in_use_{segments,bytes}` + - `treedb.cache.vlog_generation.gc.last_protected_retained_{segments,bytes}` + - `treedb.cache.vlog_generation.gc.last_protected_overlap_{segments,bytes}` + - `treedb.cache.vlog_generation.gc.last_protected_other_{segments,bytes}` + - tests: + - `TreeDB/db/vlog_gc_test.go` + - added `TestValueLogGC_ProtectedPathBreakdownStats` + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - extended `TestVlogGenerationStats_ReportRewriteBacklogAndDurations` assertions for new keys. 
+ +- Validation: + - `go test ./TreeDB/db -run 'TestValueLogGC_ProtectedPathsDoNotKeepHistoricalRewriteLanes|TestValueLogGC_ProtectedPathBreakdownStats' -count=1` + - `go test ./TreeDB/caching -run 'TestVlogGenerationStats_ReportRewriteBacklogAndDurations' -count=1` + - `go test ./TreeDB/db ./TreeDB/caching -count=1` + +- Live confirmation run for protected split counters: + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327211238` + - final debug snapshot: + - `pprof-heap-max-rss-final-11733668k-20260327211812.debug_vars.json` + - application-instance counters: + - `rewrite.plan_selected_segments_total=3` + - `rewrite.exec.source_segments_total=2` + - `rewrite.runs=1` + - `rewrite.processed_stale_bytes=498580183` + - `rewrite.reclaimed_bytes=0` + - `gc.runs=1` + - `gc.deleted_bytes=0` + - `gc.last_protected_segments=2` + - `gc.last_protected_bytes=536873037` + - `gc.last_protected_in_use_segments=0` + - `gc.last_protected_in_use_bytes=0` + - `gc.last_protected_retained_segments=2` + - `gc.last_protected_retained_bytes=536873037` + - `gc.last_protected_overlap_segments=0` + - `gc.last_protected_overlap_bytes=0` + - `gc.last_protected_other_segments=0` + - `gc.last_protected_other_bytes=0` + - `gc.last_eligible_segments=0` + - `gc.last_eligible_bytes=0` + +- Interpretation update: + - for this run window, no-reclaim is attributable to retained-path protection (not in-use protection and not eligible/pending delete). 
+ +- Offline headroom on same run home: + - command: + - `/home/mikers/dev/snissn/celestia-app-p4/build/treemap-local vlog-rewrite /home/mikers/.celestia-app-mainnet-treedb-20260327211238/data/application.db -rw` + - pre: `5266839216` + - tool output: `segments_before=22 segments_after=16 bytes_before=4993530542 bytes_after=2108841030 records=995454` + - post: `2148318606` From bcfc7ad3e8ea442c01026a6b4b7152384209a497 Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 22:43:44 -1000 Subject: [PATCH 13/61] treedb: instrument retained prune scheduling and force preemption --- TreeDB/caching/db.go | 454 +++++++++++++++++++--------- TreeDB/caching/db_test.go | 190 ++++++++++++ TreeDB/caching/expvar_stats.go | 1 + TreeDB/caching/expvar_stats_test.go | 4 + worklog/2026-03-27.md | 56 ++++ 5 files changed, 562 insertions(+), 143 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index e5ed17eba..c9dfa7c01 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4190,13 +4190,37 @@ type valueLogSetRefresher interface { RefreshValueLogSet() error } -func (db *DB) pruneRetainedValueLogs() { +type retainedValueLogPruneStats struct { + RemovedSegments int + RemovedBytes int64 + AbortedForegroundWrites bool +} + +func (db *DB) valueLogClosedSegmentSize(path string) int64 { + if db == nil || path == "" { + return 0 + } + laneID, _, _, ok := parseLogSeq(filepath.Base(path)) + if !ok || laneID < 0 || laneID >= len(db.lanes) { + return 0 + } + l := &db.lanes[laneID] + l.vlogMu.Lock() + defer l.vlogMu.Unlock() + if l.vlogClosedSizes == nil { + return 0 + } + return l.vlogClosedSizes[path] +} + +func (db *DB) pruneRetainedValueLogs() retainedValueLogPruneStats { + var out retainedValueLogPruneStats if !db.valueLogEnabled() { - return + return out } paths := db.valueLogRetainedPaths() if len(paths) == 0 { - return + return out } inUse := make(map[string]struct{}) @@ -4209,27 +4233,34 @@ func (db *DB) pruneRetainedValueLogs() { if _, ok := 
inUse[path]; ok { continue } + size := db.valueLogClosedSegmentSize(path) if db.cleanupMissingRetainedValueLog(path) { + if size > 0 { + out.RemovedSegments++ + out.RemovedBytes += size + } continue } candidatePaths = append(candidatePaths, path) } if len(candidatePaths) == 0 { - return + return out } live, err := db.collectValueLogLiveIDsUntil(db.lastForegroundWriteUnixNano.Load()) if err != nil { if errors.Is(err, errForegroundWritesResumed) { - return + out.AbortedForegroundWrites = true + return out } db.reportError(fmt.Errorf("cachingdb: failed to scan value-log pointers: %w", err)) - return + return out } removed := false marked := false for _, path := range candidatePaths { + size := db.valueLogClosedSegmentSize(path) laneID, seq, valueLog, ok := parseLogSeq(filepath.Base(path)) if !ok || !valueLog { continue @@ -4252,9 +4283,17 @@ func (db *DB) pruneRetainedValueLogs() { if err := marker.MarkValueLogZombie(id); err != nil { if errors.Is(err, valuelog.ErrFileNotFound) && db.cleanupOrphanedRetainedValueLog(path) { removed = true + if size > 0 { + out.RemovedSegments++ + out.RemovedBytes += size + } continue } if db.cleanupMissingRetainedValueLog(path) { + if size > 0 { + out.RemovedSegments++ + out.RemovedBytes += size + } continue } db.reportError(fmt.Errorf("cachingdb: failed to mark value-log %d zombie: %w", id, err)) @@ -4268,6 +4307,10 @@ func (db *DB) pruneRetainedValueLogs() { db.untrackValueLogSegmentLocked(path) db.mu.Unlock() removed = true + if size > 0 { + out.RemovedSegments++ + out.RemovedBytes += size + } } db.forgetValueLogRetain(path) } @@ -4282,6 +4325,7 @@ func (db *DB) pruneRetainedValueLogs() { if removed { db.syncDirBestEffort(db.dir) } + return out } func (db *DB) retainedPrunePressureBytes() int64 { @@ -4325,6 +4369,10 @@ func (db *DB) retainedPrunePressureBytes() int64 { } func (db *DB) shouldScheduleRetainedValueLogPrune() bool { + return db.shouldScheduleRetainedValueLogPruneWithForce(false) +} + +func (db *DB) 
shouldScheduleRetainedValueLogPruneWithForce(force bool) bool { if db == nil || !db.valueLogEnabled() { return false } @@ -4332,22 +4380,67 @@ func (db *DB) shouldScheduleRetainedValueLogPrune() bool { if closed <= 0 { return false } + if force { + return true + } return closed >= db.retainedPrunePressureBytes() } +func (db *DB) waitForRetainedValueLogPruneQuietOrForce(quietWindow time.Duration) bool { + if db == nil { + return false + } + if quietWindow <= 0 { + return db.retainedPruneForceRequested.Swap(false) + } + ticker := time.NewTicker(foregroundMaintenancePollInterval()) + defer ticker.Stop() + for { + if db.closing.Load() { + return db.retainedPruneForceRequested.Swap(false) + } + if db.retainedPruneForceRequested.Swap(false) { + return true + } + if db.foregroundActivityQuietFor(time.Now(), quietWindow, vlogForegroundReadQuietWindow) { + return false + } + select { + case <-db.closeCh: + return db.retainedPruneForceRequested.Swap(false) + case <-ticker.C: + } + } +} + func (db *DB) scheduleRetainedValueLogPrune() { + db.scheduleRetainedValueLogPruneWithForce(false) +} + +func (db *DB) scheduleRetainedValueLogPruneForce() { + db.scheduleRetainedValueLogPruneWithForce(true) +} + +func (db *DB) scheduleRetainedValueLogPruneWithForce(force bool) { if db == nil || !db.valueLogEnabled() { return } + db.retainedValueLogPruneScheduleRequests.Add(1) + if force { + db.retainedPruneForceRequested.Store(true) + db.retainedValueLogPruneScheduleForcedRequests.Add(1) + } if db.testSkipRetainedPrune { return } db.retainedPruneMu.Lock() if db.closing.Load() { + db.retainedValueLogPruneScheduleSkipClosing.Add(1) db.retainedPruneMu.Unlock() return } if db.retainedPruneDone != nil { + db.retainedValueLogPruneScheduleSkipInFlight.Add(1) db.retainedPruneMu.Unlock() return } @@ -4361,8 +4454,17 @@ func (db *DB) scheduleRetainedValueLogPrune() { db.retainedPruneDone = nil db.retainedPruneMu.Unlock() }() - db.waitForForegroundMaintenanceQuietWindow(retainedPruneQuietWindow) - if 
!db.shouldScheduleRetainedValueLogPrune() { + effectiveForce := force || db.retainedPruneForceRequested.Swap(false) + if !effectiveForce { + effectiveForce = db.waitForRetainedValueLogPruneQuietOrForce(retainedPruneQuietWindow) + } + if !db.shouldScheduleRetainedValueLogPruneWithForce(effectiveForce) { + closed := db.valueLogRetainedClosedBytes.Load() + if closed <= 0 { + db.retainedValueLogPruneScheduleSkipNoClosedBytes.Add(1) + } else if !effectiveForce && closed < db.retainedPrunePressureBytes() { + db.retainedValueLogPruneScheduleSkipBelowPressure.Add(1) + } return } // Retained prune is opportunistic reclaim; do not compete with checkpoint @@ -4379,10 +4481,25 @@ func (db *DB) scheduleRetainedValueLogPrune() { now := time.Now() last := db.retainedPruneLastStartUnixNano.Load() if last > 0 && now.Sub(time.Unix(0, last)) < retainedPruneMinInterval { + db.retainedValueLogPruneScheduleSkipMinInterval.Add(1) return } db.retainedPruneLastStartUnixNano.Store(now.UnixNano()) - db.pruneRetainedValueLogs() + db.retainedValueLogPruneRuns.Add(1) + if effectiveForce { + db.retainedValueLogPruneForcedRuns.Add(1) + } + db.retainedValueLogPruneLastUnixNano.Store(now.UnixNano()) + pruneStats := db.pruneRetainedValueLogs() + if pruneStats.AbortedForegroundWrites { + db.retainedValueLogPruneForegroundAbortRuns.Add(1) + } + if pruneStats.RemovedSegments > 0 { + db.retainedValueLogPruneRemovedSegments.Add(uint64(pruneStats.RemovedSegments)) + } + if pruneStats.RemovedBytes > 0 { + db.retainedValueLogPruneRemovedBytes.Add(uint64(pruneStats.RemovedBytes)) + } }() } @@ -5148,140 +5265,154 @@ type DB struct { valueLogMaxSegmentBytes int64 journalCompression bool - disableJournal bool - relaxedSync bool - notifyError func(error) - debugFlushPointers bool - debugFlushTiming bool - debugPtrEligible atomic.Int64 - debugPtrUsed atomic.Int64 - debugPtrNoPtr atomic.Int64 - debugPtrDenied atomic.Int64 - debugPtrDisabled atomic.Int64 - checkpointRuns atomic.Uint64 - checkpointTotalNs 
atomic.Uint64 - checkpointMaxNs atomic.Uint64 - checkpointNoopSkips atomic.Uint64 - checkpointFlushMuWaitNs atomic.Uint64 - checkpointFlushMuWaitMaxNs atomic.Uint64 - checkpointAutoVacuumRuns atomic.Uint64 - checkpointAutoVacuumLastCheckRun atomic.Uint64 - checkpointAutoVacuumLastPages atomic.Uint64 - checkpointAutoVacuumLastInternalP50 atomic.Uint64 - checkpointAutoVacuumLastInternalAvg atomic.Uint64 - lastForegroundWriteUnixNano atomic.Int64 - lastForegroundReadUnixNano atomic.Int64 - foregroundReadStampCounter atomic.Uint32 - activeForegroundIterators atomic.Int64 - retainedPruneLastStartUnixNano atomic.Int64 - retainedPruneMu sync.Mutex - retainedPruneDone chan struct{} - vlogGenerationRemapSuccesses atomic.Uint64 - vlogGenerationRemapFailures atomic.Uint64 - vlogGenerationRewriteBytesIn atomic.Uint64 - vlogGenerationRewriteBytesOut atomic.Uint64 - vlogGenerationRewriteReclaimedBytes atomic.Uint64 - vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 - vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 - vlogGenerationRewriteNoReclaimRuns atomic.Uint64 - vlogGenerationRewriteNoReclaimStaleBytes atomic.Uint64 - vlogGenerationRewriteRuns atomic.Uint64 - vlogGenerationRewritePlanRuns atomic.Uint64 - vlogGenerationRewritePlanCanceled atomic.Uint64 - vlogGenerationRewritePlanErrors atomic.Uint64 - vlogGenerationRewritePlanEmpty atomic.Uint64 - vlogGenerationRewritePlanSelected atomic.Uint64 - vlogGenerationRewritePlanSelectedSegments atomic.Uint64 - vlogGenerationRewritePlanSelectedBytes atomic.Uint64 - vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 - vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 - vlogGenerationRewritePlanCanceledLastNS atomic.Int64 - vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 - vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool - vlogGenerationRewriteIneffectiveLastNS atomic.Int64 - vlogGenerationRewriteIneffectiveRuns atomic.Uint64 - vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 - 
vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 - vlogGenerationRewriteCanceledRuns atomic.Uint64 - vlogGenerationRewriteCanceledLastNS atomic.Int64 - vlogGenerationRewriteQueuePruneRuns atomic.Uint64 - vlogGenerationRewriteQueuePruneIDs atomic.Uint64 - vlogGenerationGCSegmentsDeleted atomic.Uint64 - vlogGenerationGCBytesDeleted atomic.Uint64 - vlogGenerationGCRuns atomic.Uint64 - vlogGenerationVacuumRuns atomic.Uint64 - vlogGenerationVacuumFailures atomic.Uint64 - vlogGenerationVacuumSkippedDisabled atomic.Uint64 - vlogGenerationVacuumSkippedRewriteBytes atomic.Uint64 - vlogGenerationVacuumSkippedCooldown atomic.Uint64 - vlogGenerationLastVacuumUnixNano atomic.Int64 - vlogGenerationLastRewritePlanUnixNano atomic.Int64 - vlogGenerationLastRewriteUnixNano atomic.Int64 - vlogGenerationLastGCUnixNano atomic.Int64 - vlogGenerationLastCheckpointKickUnixNano atomic.Int64 - vlogGenerationLastGCDryRunUnixNano atomic.Int64 - vlogGenerationLastGCDryRunBytesEligible atomic.Int64 - vlogGenerationLastGCDryRunSegsEligible atomic.Int64 - vlogGenerationLastGCBytesReferenced atomic.Int64 - vlogGenerationLastGCSegmentsReferenced atomic.Int64 - vlogGenerationLastGCBytesActive atomic.Int64 - vlogGenerationLastGCSegmentsActive atomic.Int64 - vlogGenerationLastGCBytesProtected atomic.Int64 - vlogGenerationLastGCSegmentsProtected atomic.Int64 - vlogGenerationLastGCBytesProtectedInUse atomic.Int64 - vlogGenerationLastGCSegmentsProtectedInUse atomic.Int64 - vlogGenerationLastGCBytesProtectedRetained atomic.Int64 - vlogGenerationLastGCSegmentsProtectedRetained atomic.Int64 - vlogGenerationLastGCBytesProtectedOverlap atomic.Int64 - vlogGenerationLastGCSegmentsProtectedOverlap atomic.Int64 - vlogGenerationLastGCBytesProtectedOther atomic.Int64 - vlogGenerationLastGCSegmentsProtectedOther atomic.Int64 - vlogGenerationLastGCBytesEligible atomic.Int64 - vlogGenerationLastGCSegmentsEligible atomic.Int64 - vlogGenerationLastGCBytesDeleted atomic.Int64 - vlogGenerationLastGCSegmentsDeleted 
atomic.Int64 - vlogGenerationLastGCBytesPending atomic.Int64 - vlogGenerationLastGCSegmentsPending atomic.Int64 - vlogGenerationChurnBytes atomic.Uint64 - vlogGenerationSchedulerState atomic.Uint32 - vlogGenerationMaintenanceActive atomic.Bool - vlogGenerationMaintenanceAttempts atomic.Uint64 - vlogGenerationMaintenanceAcquired atomic.Uint64 - vlogGenerationMaintenanceCollisions atomic.Uint64 - vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 - vlogGenerationMaintenanceSkipPhase atomic.Uint64 - vlogGenerationMaintenanceSkipStageGate atomic.Uint64 - vlogGenerationMaintenanceSkipStageNotDue atomic.Uint64 - vlogGenerationMaintenanceSkipStageDue atomic.Uint64 - vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 - vlogGenerationMaintenanceSkipPriority atomic.Uint64 - vlogGenerationMaintenanceSkipQuiet atomic.Uint64 - vlogGenerationMaintenanceSkipPreCheckpoint atomic.Uint64 - vlogGenerationMaintenanceSkipCheckpointing atomic.Uint64 - vlogGenerationMaintenancePassNoop atomic.Uint64 - vlogGenerationMaintenancePassWithRewrite atomic.Uint64 - vlogGenerationMaintenancePassWithGC atomic.Uint64 - vlogGenerationMaintenancePassTotalNanos atomic.Uint64 - vlogGenerationMaintenancePassMaxNanos atomic.Uint64 - vlogGenerationLastReason atomic.Uint32 - vlogGenerationCheckpointKickRuns atomic.Uint64 - vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 - vlogGenerationCheckpointKickGCRuns atomic.Uint64 - vlogGenerationCheckpointKickPending atomic.Bool - vlogGenerationDeferredMaintenancePending atomic.Bool - vlogGenerationDeferredMaintenanceRunning atomic.Bool - vlogGenerationRewriteStageWakeObservedNS atomic.Int64 - vlogGenerationRewriteQueueMu sync.Mutex - vlogGenerationCheckpointKickActive atomic.Bool - vlogGenerationRewriteQueue []uint32 - vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment - vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty - vlogGenerationRewriteStagePending bool - vlogGenerationRewriteStageObservedUnixNano int64 
- vlogGenerationRewriteQueueLoaded bool - vlogGenerationLastChurnBps atomic.Int64 - vlogGenerationLastChurnSampleBytes atomic.Uint64 - vlogGenerationLastChurnSampleNS atomic.Int64 + disableJournal bool + relaxedSync bool + notifyError func(error) + debugFlushPointers bool + debugFlushTiming bool + debugPtrEligible atomic.Int64 + debugPtrUsed atomic.Int64 + debugPtrNoPtr atomic.Int64 + debugPtrDenied atomic.Int64 + debugPtrDisabled atomic.Int64 + checkpointRuns atomic.Uint64 + checkpointTotalNs atomic.Uint64 + checkpointMaxNs atomic.Uint64 + checkpointNoopSkips atomic.Uint64 + checkpointFlushMuWaitNs atomic.Uint64 + checkpointFlushMuWaitMaxNs atomic.Uint64 + checkpointAutoVacuumRuns atomic.Uint64 + checkpointAutoVacuumLastCheckRun atomic.Uint64 + checkpointAutoVacuumLastPages atomic.Uint64 + checkpointAutoVacuumLastInternalP50 atomic.Uint64 + checkpointAutoVacuumLastInternalAvg atomic.Uint64 + lastForegroundWriteUnixNano atomic.Int64 + lastForegroundReadUnixNano atomic.Int64 + foregroundReadStampCounter atomic.Uint32 + activeForegroundIterators atomic.Int64 + retainedPruneLastStartUnixNano atomic.Int64 + retainedValueLogPruneLastUnixNano atomic.Int64 + retainedValueLogPruneRuns atomic.Uint64 + retainedValueLogPruneForcedRuns atomic.Uint64 + retainedValueLogPruneForegroundAbortRuns atomic.Uint64 + retainedValueLogPruneRemovedSegments atomic.Uint64 + retainedValueLogPruneRemovedBytes atomic.Uint64 + retainedValueLogPruneScheduleRequests atomic.Uint64 + retainedValueLogPruneScheduleForcedRequests atomic.Uint64 + retainedValueLogPruneScheduleSkipClosing atomic.Uint64 + retainedValueLogPruneScheduleSkipInFlight atomic.Uint64 + retainedValueLogPruneScheduleSkipNoClosedBytes atomic.Uint64 + retainedValueLogPruneScheduleSkipBelowPressure atomic.Uint64 + retainedValueLogPruneScheduleSkipMinInterval atomic.Uint64 + retainedPruneForceRequested atomic.Bool + retainedPruneMu sync.Mutex + retainedPruneDone chan struct{} + vlogGenerationRemapSuccesses atomic.Uint64 + 
vlogGenerationRemapFailures atomic.Uint64 + vlogGenerationRewriteBytesIn atomic.Uint64 + vlogGenerationRewriteBytesOut atomic.Uint64 + vlogGenerationRewriteReclaimedBytes atomic.Uint64 + vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 + vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 + vlogGenerationRewriteNoReclaimRuns atomic.Uint64 + vlogGenerationRewriteNoReclaimStaleBytes atomic.Uint64 + vlogGenerationRewriteRuns atomic.Uint64 + vlogGenerationRewritePlanRuns atomic.Uint64 + vlogGenerationRewritePlanCanceled atomic.Uint64 + vlogGenerationRewritePlanErrors atomic.Uint64 + vlogGenerationRewritePlanEmpty atomic.Uint64 + vlogGenerationRewritePlanSelected atomic.Uint64 + vlogGenerationRewritePlanSelectedSegments atomic.Uint64 + vlogGenerationRewritePlanSelectedBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 + vlogGenerationRewritePlanCanceledLastNS atomic.Int64 + vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 + vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool + vlogGenerationRewriteIneffectiveLastNS atomic.Int64 + vlogGenerationRewriteIneffectiveRuns atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 + vlogGenerationRewriteCanceledRuns atomic.Uint64 + vlogGenerationRewriteCanceledLastNS atomic.Int64 + vlogGenerationRewriteQueuePruneRuns atomic.Uint64 + vlogGenerationRewriteQueuePruneIDs atomic.Uint64 + vlogGenerationGCSegmentsDeleted atomic.Uint64 + vlogGenerationGCBytesDeleted atomic.Uint64 + vlogGenerationGCRuns atomic.Uint64 + vlogGenerationVacuumRuns atomic.Uint64 + vlogGenerationVacuumFailures atomic.Uint64 + vlogGenerationVacuumSkippedDisabled atomic.Uint64 + vlogGenerationVacuumSkippedRewriteBytes atomic.Uint64 + vlogGenerationVacuumSkippedCooldown atomic.Uint64 + vlogGenerationLastVacuumUnixNano atomic.Int64 + vlogGenerationLastRewritePlanUnixNano atomic.Int64 + 
vlogGenerationLastRewriteUnixNano atomic.Int64 + vlogGenerationLastGCUnixNano atomic.Int64 + vlogGenerationLastCheckpointKickUnixNano atomic.Int64 + vlogGenerationLastGCDryRunUnixNano atomic.Int64 + vlogGenerationLastGCDryRunBytesEligible atomic.Int64 + vlogGenerationLastGCDryRunSegsEligible atomic.Int64 + vlogGenerationLastGCBytesReferenced atomic.Int64 + vlogGenerationLastGCSegmentsReferenced atomic.Int64 + vlogGenerationLastGCBytesActive atomic.Int64 + vlogGenerationLastGCSegmentsActive atomic.Int64 + vlogGenerationLastGCBytesProtected atomic.Int64 + vlogGenerationLastGCSegmentsProtected atomic.Int64 + vlogGenerationLastGCBytesProtectedInUse atomic.Int64 + vlogGenerationLastGCSegmentsProtectedInUse atomic.Int64 + vlogGenerationLastGCBytesProtectedRetained atomic.Int64 + vlogGenerationLastGCSegmentsProtectedRetained atomic.Int64 + vlogGenerationLastGCBytesProtectedOverlap atomic.Int64 + vlogGenerationLastGCSegmentsProtectedOverlap atomic.Int64 + vlogGenerationLastGCBytesProtectedOther atomic.Int64 + vlogGenerationLastGCSegmentsProtectedOther atomic.Int64 + vlogGenerationLastGCBytesEligible atomic.Int64 + vlogGenerationLastGCSegmentsEligible atomic.Int64 + vlogGenerationLastGCBytesDeleted atomic.Int64 + vlogGenerationLastGCSegmentsDeleted atomic.Int64 + vlogGenerationLastGCBytesPending atomic.Int64 + vlogGenerationLastGCSegmentsPending atomic.Int64 + vlogGenerationChurnBytes atomic.Uint64 + vlogGenerationSchedulerState atomic.Uint32 + vlogGenerationMaintenanceActive atomic.Bool + vlogGenerationMaintenanceAttempts atomic.Uint64 + vlogGenerationMaintenanceAcquired atomic.Uint64 + vlogGenerationMaintenanceCollisions atomic.Uint64 + vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 + vlogGenerationMaintenanceSkipPhase atomic.Uint64 + vlogGenerationMaintenanceSkipStageGate atomic.Uint64 + vlogGenerationMaintenanceSkipStageNotDue atomic.Uint64 + vlogGenerationMaintenanceSkipStageDue atomic.Uint64 + vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 + 
vlogGenerationMaintenanceSkipPriority atomic.Uint64 + vlogGenerationMaintenanceSkipQuiet atomic.Uint64 + vlogGenerationMaintenanceSkipPreCheckpoint atomic.Uint64 + vlogGenerationMaintenanceSkipCheckpointing atomic.Uint64 + vlogGenerationMaintenancePassNoop atomic.Uint64 + vlogGenerationMaintenancePassWithRewrite atomic.Uint64 + vlogGenerationMaintenancePassWithGC atomic.Uint64 + vlogGenerationMaintenancePassTotalNanos atomic.Uint64 + vlogGenerationMaintenancePassMaxNanos atomic.Uint64 + vlogGenerationLastReason atomic.Uint32 + vlogGenerationCheckpointKickRuns atomic.Uint64 + vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 + vlogGenerationCheckpointKickGCRuns atomic.Uint64 + vlogGenerationCheckpointKickPending atomic.Bool + vlogGenerationDeferredMaintenancePending atomic.Bool + vlogGenerationDeferredMaintenanceRunning atomic.Bool + vlogGenerationRewriteStageWakeObservedNS atomic.Int64 + vlogGenerationRewriteQueueMu sync.Mutex + vlogGenerationCheckpointKickActive atomic.Bool + vlogGenerationRewriteQueue []uint32 + vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment + vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty + vlogGenerationRewriteStagePending bool + vlogGenerationRewriteStageObservedUnixNano int64 + vlogGenerationRewriteQueueLoaded bool + vlogGenerationLastChurnBps atomic.Int64 + vlogGenerationLastChurnSampleBytes atomic.Uint64 + vlogGenerationLastChurnSampleNS atomic.Int64 // Rewrite budget token bucket (bytes) for online maintenance. This lets us // interpret ValueLogRewriteBudgetBytesPerSec as a true per-second bandwidth // budget while still running maintenance at coarse intervals. 
@@ -7381,6 +7512,9 @@ func Open(dir string, backend BackendDB, opts Options) (*DB, error) { } l.vlogClosedSizes[seg.path] = seg.size l.vlogClosedBytes.Add(seg.size) + if _, retained := db.valueLogRetain[seg.path]; retained { + db.valueLogRetainedClosedBytes.Add(seg.size) + } } else { if seg.path == l.walPath { continue @@ -14006,6 +14140,19 @@ planned: return fmt.Errorf("generational gc after rewrite: %w", gcErr) } db.observeVlogGenerationGCStats(gcStats) + db.vlogGenerationGCRuns.Add(1) + if gcStats.SegmentsDeleted > 0 { + db.vlogGenerationGCSegmentsDeleted.Add(uint64(gcStats.SegmentsDeleted)) + } + if gcStats.BytesDeleted > 0 { + db.vlogGenerationGCBytesDeleted.Add(uint64(gcStats.BytesDeleted)) + } + if gcStats.BytesProtectedRetained > 0 && gcStats.BytesEligible == 0 && db.valueLogRetainedClosedBytes.Load() > 0 { + // Retained-path protection can starve live reclaim even when rewrite + // processed stale payload in-pass. Kick an eager retained prune so + // lifecycle pins can drain without waiting for byte-pressure gates. + db.scheduleRetainedValueLogPruneForce() + } if gcStats.BytesDeleted > 0 { gcBytesDeleted = int64(gcStats.BytesDeleted) effectiveBytesAfter -= gcBytesDeleted @@ -14215,6 +14362,11 @@ planned: return fmt.Errorf("generational gc: %w", err) } db.observeVlogGenerationGCStats(gcStats) + if gcStats.BytesProtectedRetained > 0 && gcStats.BytesEligible == 0 && db.valueLogRetainedClosedBytes.Load() > 0 { + // When GC classifies all reclaim blockers as retained-path protection, + // trigger an eager retained prune pass to release stale lifecycle pins. 
+ db.scheduleRetainedValueLogPruneForce() + } db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) db.vlogGenerationGCRuns.Add(1) if gcStats.SegmentsDeleted > 0 { @@ -19785,6 +19937,22 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_retained_segments"] = fmt.Sprintf("%d", vlogSegments) stats["treedb.cache.vlog_retained_bytes_estimate"] = fmt.Sprintf("%d", vlogBytes) stats["treedb.process.memory.vlog_retained_bytes_estimate"] = fmt.Sprintf("%d", vlogBytes) + stats["treedb.cache.vlog_retained_prune.closed_bytes"] = fmt.Sprintf("%d", db.valueLogRetainedClosedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_unix_nano"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastUnixNano.Load()) + stats["treedb.cache.vlog_retained_prune.runs"] = fmt.Sprintf("%d", db.retainedValueLogPruneRuns.Load()) + stats["treedb.cache.vlog_retained_prune.forced_runs"] = fmt.Sprintf("%d", db.retainedValueLogPruneForcedRuns.Load()) + stats["treedb.cache.vlog_retained_prune.foreground_abort_runs"] = fmt.Sprintf("%d", db.retainedValueLogPruneForegroundAbortRuns.Load()) + stats["treedb.cache.vlog_retained_prune.removed_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneRemovedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.removed_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneRemovedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.pressure_bytes"] = fmt.Sprintf("%d", db.retainedPrunePressureBytes()) + stats["treedb.cache.vlog_retained_prune.schedule_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleRequests.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleForcedRequests.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_skip.closing"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipClosing.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_skip.inflight"] = fmt.Sprintf("%d", 
db.retainedValueLogPruneScheduleSkipInFlight.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_skip.no_closed_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipNoClosedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_skip.below_pressure"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipBelowPressure.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_skip.min_interval"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipMinInterval.Load()) + stats["treedb.cache.vlog_retained_prune.force_pending"] = fmt.Sprintf("%t", db.retainedPruneForceRequested.Load()) stats["treedb.cache.vlog_generation.policy"] = fmt.Sprintf("%d", db.valueLogGenerationPolicy) stats["treedb.cache.vlog_generation.enabled"] = fmt.Sprintf("%t", db.valueLogGenerationPolicy == uint8(backenddb.ValueLogGenerationHotWarmCold)) stats["treedb.cache.vlog_generation.maintenance_phase"] = maintenancePhaseString(db.maintenancePhase.Load()) diff --git a/TreeDB/caching/db_test.go b/TreeDB/caching/db_test.go index cb6e8adb3..8882fb6db 100644 --- a/TreeDB/caching/db_test.go +++ b/TreeDB/caching/db_test.go @@ -1953,6 +1953,57 @@ func TestCachingDB_PrunesRetainedValueLog(t *testing.T) { } } +func TestOpen_InitializesRetainedClosedBytesFromExistingSegments(t *testing.T) { + dir := t.TempDir() + + opts := Options{ + DisableWAL: true, + RelaxedSync: true, + AllowUnsafe: true, + FlushThreshold: 1 << 20, + ValueLogPointerThreshold: 1, + } + + backend1, err := db.Open(db.Options{Dir: dir, ChunkSize: 64 * 1024}) + if err != nil { + t.Fatalf("backend1 open: %v", err) + } + cache1, err := Open(dir, backend1, opts) + if err != nil { + _ = backend1.Close() + t.Fatalf("cache1 open: %v", err) + } + + if err := cache1.Set([]byte("k"), bytes.Repeat([]byte("x"), page.DefaultInlineThreshold+256)); err != nil { + t.Fatalf("Set: %v", err) + } + cache1.flushAll(false) + if err := cache1.rotateValueLogLocked(&cache1.lanes[0]); err != nil { + t.Fatalf("rotateValueLogLocked: %v", 
err) + } + if got := cache1.valueLogRetainedClosedBytes.Load(); got <= 0 { + t.Fatalf("pre-close retained closed bytes=%d want >0", got) + } + if err := cache1.Close(); err != nil { + t.Fatalf("cache1 close: %v", err) + } + + backend2, err := db.Open(db.Options{Dir: dir, ChunkSize: 64 * 1024}) + if err != nil { + t.Fatalf("backend2 open: %v", err) + } + cache2, err := Open(dir, backend2, opts) + if err != nil { + _ = backend2.Close() + t.Fatalf("cache2 open: %v", err) + } + defer cache2.Close() + + if got := cache2.valueLogRetainedClosedBytes.Load(); got <= 0 { + t.Fatalf("reopen retained closed bytes=%d want >0", got) + } +} + func TestPruneRetainedValueLogs_SkipsLiveScanWhenAllRetainedPathsInUse(t *testing.T) { dir := t.TempDir() backend := NewMockBackend() @@ -2361,6 +2412,145 @@ func TestCheckpoint_SkipsRetainedValueLogPruneBelowPressureThreshold(t *testing. if cache.retainedPruneActive() { cache.waitForRetainedValueLogPrune() } + stats := cache.Stats() + if got := stats["treedb.cache.vlog_retained_prune.schedule_requests"]; got != "1" { + t.Fatalf("schedule_requests=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"]; got != "0" { + t.Fatalf("schedule_forced_requests=%q want 0", got) + } + if got := stats["treedb.cache.vlog_retained_prune.schedule_skip.below_pressure"]; got != "1" { + t.Fatalf("schedule_skip.below_pressure=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.closed_bytes"]; got != "128" { + t.Fatalf("closed_bytes=%q want 128", got) + } +} + +func TestRetainedValueLogPruneForce_BypassesPressureThreshold(t *testing.T) { + dir := t.TempDir() + backend := NewMockBackend() + backend.iteratorStartedCh = make(chan struct{}) + backend.iteratorBlockCh = make(chan struct{}) + + cache, err := Open(dir, backend, Options{ + DisableWAL: true, + RelaxedSync: true, + AllowUnsafe: true, + FlushThreshold: 1 << 20, + MaxValueLogRetainedBytes: 1 << 20, + ValueLogPointerThreshold: 1, + }) + if err 
!= nil { + t.Fatalf("cache open: %v", err) + } + defer cache.Close() + + fileID, err := valuelog.EncodeFileID(0, 245) + if err != nil { + t.Fatalf("EncodeFileID: %v", err) + } + retainedPath := filepath.Join(dir, "wal", "value-l0-000245.log") + if err := os.MkdirAll(filepath.Dir(retainedPath), 0o755); err != nil { + t.Fatalf("MkdirAll: %v", err) + } + w, err := valuelog.NewWriter(retainedPath, fileID) + if err != nil { + t.Fatalf("NewWriter: %v", err) + } + if _, err := w.Append(0, nil, 1, bytes.Repeat([]byte("t"), 128)); err != nil { + _ = w.Close() + t.Fatalf("Append: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close writer: %v", err) + } + cache.markValueLogRetain(retainedPath) + seedRetainedPrunePressure(cache, retainedPath, 128) + cache.lastForegroundWriteUnixNano.Store(time.Now().Add(-2 * retainedPruneQuietWindow).UnixNano()) + + cache.scheduleRetainedValueLogPruneForce() + + select { + case <-backend.iteratorStartedCh: + case <-time.After(2 * time.Second): + t.Fatalf("forced retained prune did not start below pressure threshold") + } + close(backend.iteratorBlockCh) + cache.waitForRetainedValueLogPrune() + stats := cache.Stats() + if got := stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"]; got != "1" { + t.Fatalf("schedule_forced_requests=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.forced_runs"]; got != "1" { + t.Fatalf("forced_runs=%q want 1", got) + } +} + +func TestRetainedValueLogPruneForce_PreemptsQuietWait(t *testing.T) { + dir := t.TempDir() + backend := NewMockBackend() + backend.iteratorStartedCh = make(chan struct{}) + backend.iteratorBlockCh = make(chan struct{}) + + cache, err := Open(dir, backend, Options{ + DisableWAL: true, + RelaxedSync: true, + AllowUnsafe: true, + FlushThreshold: 1 << 20, + ValueLogPointerThreshold: 1, + }) + if err != nil { + t.Fatalf("cache open: %v", err) + } + defer cache.Close() + + fileID, err := valuelog.EncodeFileID(0, 246) + if err != nil { + 
t.Fatalf("EncodeFileID: %v", err) + } + retainedPath := filepath.Join(dir, "wal", "value-l0-000246.log") + if err := os.MkdirAll(filepath.Dir(retainedPath), 0o755); err != nil { + t.Fatalf("MkdirAll: %v", err) + } + w, err := valuelog.NewWriter(retainedPath, fileID) + if err != nil { + t.Fatalf("NewWriter: %v", err) + } + if _, err := w.Append(0, nil, 1, bytes.Repeat([]byte("u"), 128)); err != nil { + _ = w.Close() + t.Fatalf("Append: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close writer: %v", err) + } + cache.markValueLogRetain(retainedPath) + seedRetainedPrunePressure(cache, retainedPath, 2<<30) + cache.lastForegroundWriteUnixNano.Store(time.Now().UnixNano()) + + cache.scheduleRetainedValueLogPrune() + select { + case <-backend.iteratorStartedCh: + t.Fatalf("retained prune started before quiet window elapsed") + case <-time.After(retainedPruneNegativeAssertWait): + } + + cache.scheduleRetainedValueLogPruneForce() + + select { + case <-backend.iteratorStartedCh: + case <-time.After(2 * time.Second): + t.Fatalf("forced retained prune did not preempt quiet-window wait") + } + close(backend.iteratorBlockCh) + cache.waitForRetainedValueLogPrune() + stats := cache.Stats() + if got := stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"]; got != "1" { + t.Fatalf("schedule_forced_requests=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.forced_runs"]; got != "1" { + t.Fatalf("forced_runs=%q want 1", got) + } } func TestCheckpoint_DoesNotWaitForPriorRetainedValueLogPrune(t *testing.T) { diff --git a/TreeDB/caching/expvar_stats.go b/TreeDB/caching/expvar_stats.go index 5a7240f46..3e9f2b7ed 100644 --- a/TreeDB/caching/expvar_stats.go +++ b/TreeDB/caching/expvar_stats.go @@ -140,6 +140,7 @@ func selectTreeDBExpvarStats(stats map[string]string) map[string]any { strings.HasPrefix(k, "treedb.cache.vlog_auto.") || strings.HasPrefix(k, "treedb.cache.vlog_dict.") || strings.HasPrefix(k, "treedb.cache.vlog_generation.") || + 
strings.HasPrefix(k, "treedb.cache.vlog_retained_prune.") || strings.HasPrefix(k, "treedb.cache.vlog_payload_kind.") || strings.HasPrefix(k, "treedb.cache.vlog_outer_leaf_codec.") || strings.HasPrefix(k, "treedb.cache.batch_arena.") { diff --git a/TreeDB/caching/expvar_stats_test.go b/TreeDB/caching/expvar_stats_test.go index ff1982510..cc88d93ab 100644 --- a/TreeDB/caching/expvar_stats_test.go +++ b/TreeDB/caching/expvar_stats_test.go @@ -29,6 +29,7 @@ func TestSelectTreeDBExpvarStatsFiltersAndCoerces(t *testing.T) { "treedb.cache.vlog_payload_kind.raw_bytes.single_value": "2048", "treedb.cache.vlog_outer_leaf_codec.raw_bytes.lz4": "512", "treedb.cache.vlog_generation.rewrite.reclaimed_bytes": "1234", + "treedb.cache.vlog_retained_prune.runs": "3", "treedb.process.memory.heap_inuse_bytes": "4096", "treedb.process.memory.pool_pressure_level": "critical", "treedb.cache.batch_arena.pool_bytes_estimate": "65536", @@ -84,6 +85,9 @@ func TestSelectTreeDBExpvarStatsFiltersAndCoerces(t *testing.T) { if v, ok := got["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"].(int64); !ok || v != 1234 { t.Fatalf("vlog_generation.rewrite.reclaimed_bytes=%T(%v) want int64(1234)", got["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"], got["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"]) } + if v, ok := got["treedb.cache.vlog_retained_prune.runs"].(int64); !ok || v != 3 { + t.Fatalf("vlog_retained_prune.runs=%T(%v) want int64(3)", got["treedb.cache.vlog_retained_prune.runs"], got["treedb.cache.vlog_retained_prune.runs"]) + } if v, ok := got["treedb.process.memory.heap_inuse_bytes"].(int64); !ok || v != 4096 { t.Fatalf("heap_inuse_bytes=%T(%v) want int64(4096)", got["treedb.process.memory.heap_inuse_bytes"], got["treedb.process.memory.heap_inuse_bytes"]) } diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index b44c770fb..c4178103a 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -490,3 +490,59 @@ - pre: `5266839216` - tool output: 
`segments_before=22 segments_after=16 bytes_before=4993530542 bytes_after=2108841030 records=995454` - post: `2148318606` + +## Retained-Prune Scheduling Instrumentation + Force Preemption (late update) + +- Code updates: + - `TreeDB/caching/db.go` + - retained-prune now exports explicit counters: + - `treedb.cache.vlog_retained_prune.closed_bytes` + - `treedb.cache.vlog_retained_prune.pressure_bytes` + - `treedb.cache.vlog_retained_prune.schedule_requests` + - `treedb.cache.vlog_retained_prune.schedule_forced_requests` + - `treedb.cache.vlog_retained_prune.schedule_skip.{closing,inflight,no_closed_bytes,below_pressure,min_interval}` + - `treedb.cache.vlog_retained_prune.force_pending` + - plus run/outcome counters (`runs`, `forced_runs`, `foreground_abort_runs`, `removed_*`). + - expvar now exports `treedb.cache.vlog_retained_prune.*` via allowlist. + - forced retained-prune requests can preempt a currently inflight quiet-window wait (instead of being starved by `schedule_skip.inflight`). + - retained-prune force trigger from GC-after-rewrite/periodic-GC is gated on `valueLogRetainedClosedBytes > 0`. + - `vlog_generation.gc.{runs,deleted_*}` accounting is updated in post-rewrite GC path as well. + - open-path retained closed-byte initialization includes existing retained segments found at startup. + - `TreeDB/caching/db_test.go` + - added: + - `TestOpen_InitializesRetainedClosedBytesFromExistingSegments` + - `TestRetainedValueLogPruneForce_BypassesPressureThreshold` + - `TestRetainedValueLogPruneForce_PreemptsQuietWait` + - extended `TestCheckpoint_SkipsRetainedValueLogPruneBelowPressureThreshold` with schedule/closed-byte assertions. + - `TreeDB/caching/expvar_stats_test.go` + - selector test now verifies retained-prune family inclusion. 
+ +- Validation: + - focused: + - `go test ./TreeDB/caching -run 'TestSelectTreeDBExpvarStatsFiltersAndCoerces|TestOpen_InitializesRetainedClosedBytesFromExistingSegments|TestRetainedValueLogPruneForce_BypassesPressureThreshold|TestRetainedValueLogPruneForce_PreemptsQuietWait|TestCheckpoint_SkipsRetainedValueLogPruneBelowPressureThreshold|TestCheckpoint_DoesNotWaitForPriorRetainedValueLogPrune|TestCheckpoint_SchedulesRetainedValueLogPruneAsynchronously|TestCheckpoint_DefersRetainedValueLogPruneUntilForegroundQuiet|TestRetainedValueLogPrune_AbortsWhenForegroundWritesResume|TestCheckpoint_RateLimitsRetainedValueLogPrune|TestBackendMaintenance_DoesNotBlockOnRetainedValueLogPruneQuietWindow|TestVlogGenerationStats_ReportRewriteBacklogAndDurations' -count=1` + - full: + - `go test ./TreeDB/caching ./TreeDB/db -count=1` + +- Live run readouts: + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260327222148` + - snapshot: `pprof-heap-max-rss-final-11238156k-20260327222731.debug_vars.json` + - application instance: + - `rewrite.runs=1`, `rewrite.processed_stale_bytes=498581053`, `rewrite.reclaimed_bytes=0` + - `gc.runs=2`, `gc.last_protected_retained_bytes=536873907`, `gc.last_eligible_bytes=0` + - `retained_prune.closed_bytes=5100295854` + - `retained_prune.pressure_bytes=17179869164` + - `retained_prune.schedule_requests=1551` + - `retained_prune.schedule_forced_requests=1` + - `retained_prune.schedule_skip.inflight=1549` + - `retained_prune.runs=0` + - interpretation: + - before force-preemption fix, one inflight quiet-window worker starved later forced request. 
+ - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260327223633` + - snapshot: `pprof-heap-max-rss-8938296k-20260327224120.debug_vars.json` + - application instance: + - `rewrite.runs=0` (forced path not exercised on this short window) + - `retained_prune.closed_bytes=4563428411` + - `retained_prune.schedule_requests=185` + - `retained_prune.schedule_forced_requests=0` + - `retained_prune.schedule_skip.inflight=183` + - `retained_prune.runs=0` From da497ab14e9048b40d7011452b5459ac30da85be Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 23:07:10 -1000 Subject: [PATCH 14/61] treedb: retry forced retained-prune scan without write gate --- TreeDB/caching/db.go | 23 ++++++++++- TreeDB/caching/db_test.go | 81 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 3 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index c9dfa7c01..ae87b8566 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4194,6 +4194,8 @@ type retainedValueLogPruneStats struct { RemovedSegments int RemovedBytes int64 AbortedForegroundWrites bool + RetriedWithoutWriteGate bool + RetrySucceeded bool } func (db *DB) valueLogClosedSegmentSize(path string) int64 { @@ -4213,7 +4215,7 @@ func (db *DB) valueLogClosedSegmentSize(path string) int64 { return l.vlogClosedSizes[path] } -func (db *DB) pruneRetainedValueLogs() retainedValueLogPruneStats { +func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { var out retainedValueLogPruneStats if !db.valueLogEnabled() { return out @@ -4248,6 +4250,13 @@ func (db *DB) pruneRetainedValueLogs() retainedValueLogPruneStats { } live, err := db.collectValueLogLiveIDsUntil(db.lastForegroundWriteUnixNano.Load()) + if err != nil && force && errors.Is(err, errForegroundWritesResumed) { + out.RetriedWithoutWriteGate = true + live, err = db.collectValueLogLiveIDsUntil(0) + if err == nil { + out.RetrySucceeded = true + } + } if err != nil { if errors.Is(err, 
errForegroundWritesResumed) { out.AbortedForegroundWrites = true @@ -4490,7 +4499,13 @@ func (db *DB) scheduleRetainedValueLogPruneWithForce(force bool) { db.retainedValueLogPruneForcedRuns.Add(1) } db.retainedValueLogPruneLastUnixNano.Store(now.UnixNano()) - pruneStats := db.pruneRetainedValueLogs() + pruneStats := db.pruneRetainedValueLogs(effectiveForce) + if pruneStats.RetriedWithoutWriteGate { + db.retainedValueLogPruneWriteGateRetries.Add(1) + if pruneStats.RetrySucceeded { + db.retainedValueLogPruneWriteGateRetrySuccesses.Add(1) + } + } if pruneStats.AbortedForegroundWrites { db.retainedValueLogPruneForegroundAbortRuns.Add(1) } @@ -5304,6 +5319,8 @@ type DB struct { retainedValueLogPruneScheduleSkipNoClosedBytes atomic.Uint64 retainedValueLogPruneScheduleSkipBelowPressure atomic.Uint64 retainedValueLogPruneScheduleSkipMinInterval atomic.Uint64 + retainedValueLogPruneWriteGateRetries atomic.Uint64 + retainedValueLogPruneWriteGateRetrySuccesses atomic.Uint64 retainedPruneForceRequested atomic.Bool retainedPruneMu sync.Mutex retainedPruneDone chan struct{} @@ -19952,6 +19969,8 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_retained_prune.schedule_skip.no_closed_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipNoClosedBytes.Load()) stats["treedb.cache.vlog_retained_prune.schedule_skip.below_pressure"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipBelowPressure.Load()) stats["treedb.cache.vlog_retained_prune.schedule_skip.min_interval"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipMinInterval.Load()) + stats["treedb.cache.vlog_retained_prune.write_gate_retries"] = fmt.Sprintf("%d", db.retainedValueLogPruneWriteGateRetries.Load()) + stats["treedb.cache.vlog_retained_prune.write_gate_retry_successes"] = fmt.Sprintf("%d", db.retainedValueLogPruneWriteGateRetrySuccesses.Load()) stats["treedb.cache.vlog_retained_prune.force_pending"] = fmt.Sprintf("%t", db.retainedPruneForceRequested.Load()) 
stats["treedb.cache.vlog_generation.policy"] = fmt.Sprintf("%d", db.valueLogGenerationPolicy) stats["treedb.cache.vlog_generation.enabled"] = fmt.Sprintf("%t", db.valueLogGenerationPolicy == uint8(backenddb.ValueLogGenerationHotWarmCold)) diff --git a/TreeDB/caching/db_test.go b/TreeDB/caching/db_test.go index 8882fb6db..3c8d029f5 100644 --- a/TreeDB/caching/db_test.go +++ b/TreeDB/caching/db_test.go @@ -2027,7 +2027,7 @@ func TestPruneRetainedValueLogs_SkipsLiveScanWhenAllRetainedPathsInUse(t *testin } cache.markValueLogRetain(retained) - cache.pruneRetainedValueLogs() + cache.pruneRetainedValueLogs(false) backend.mu.RLock() iteratorCalls := backend.iteratorCalls @@ -2279,6 +2279,85 @@ func TestRetainedValueLogPrune_AbortsWhenForegroundWritesResume(t *testing.T) { } } +func TestRetainedValueLogPruneForce_RetriesAfterForegroundWritesResume(t *testing.T) { + dir := t.TempDir() + backend := NewMockBackend() + backend.iteratorStartedCh = make(chan struct{}) + backend.iteratorBlockCh = make(chan struct{}) + + cache, err := Open(dir, backend, Options{ + DisableWAL: true, + RelaxedSync: true, + AllowUnsafe: true, + FlushThreshold: 1 << 20, + ValueLogPointerThreshold: 1, + }) + if err != nil { + t.Fatalf("cache open: %v", err) + } + defer cache.Close() + + fileID, err := valuelog.EncodeFileID(0, 212) + if err != nil { + t.Fatalf("EncodeFileID: %v", err) + } + retainedPath := filepath.Join(dir, "wal", "value-l0-000212.log") + if err := os.MkdirAll(filepath.Dir(retainedPath), 0o755); err != nil { + t.Fatalf("MkdirAll: %v", err) + } + w, err := valuelog.NewWriter(retainedPath, fileID) + if err != nil { + t.Fatalf("NewWriter: %v", err) + } + if _, err := w.Append(0, nil, 1, bytes.Repeat([]byte("r"), 128)); err != nil { + _ = w.Close() + t.Fatalf("Append: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close writer: %v", err) + } + cache.markValueLogRetain(retainedPath) + seedRetainedPrunePressure(cache, retainedPath, 2<<30) + 
cache.lastForegroundWriteUnixNano.Store(time.Now().Add(-2 * retainedPruneQuietWindow).UnixNano()) + + cache.scheduleRetainedValueLogPruneForce() + + select { + case <-backend.iteratorStartedCh: + case <-time.After(2 * time.Second): + t.Fatalf("forced prune did not start") + } + + lastWrite := cache.lastForegroundWriteUnixNano.Load() + deadline := time.Now().Add(2 * time.Second) + for !cache.foregroundWritesResumedSince(lastWrite) { + if time.Now().After(deadline) { + t.Fatalf("foreground write timestamp did not advance") + } + cache.noteWrite() + time.Sleep(time.Millisecond) + } + close(backend.iteratorBlockCh) + cache.waitForRetainedValueLogPrune() + + if cache.valueLogRetained(retainedPath) { + t.Fatalf("retained path still marked after forced retry prune") + } + stats := cache.Stats() + if got := stats["treedb.cache.vlog_retained_prune.forced_runs"]; got != "1" { + t.Fatalf("forced_runs=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.foreground_abort_runs"]; got != "0" { + t.Fatalf("foreground_abort_runs=%q want 0", got) + } + if got := stats["treedb.cache.vlog_retained_prune.write_gate_retries"]; got != "1" { + t.Fatalf("write_gate_retries=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.write_gate_retry_successes"]; got != "1" { + t.Fatalf("write_gate_retry_successes=%q want 1", got) + } +} + func TestCheckpoint_RateLimitsRetainedValueLogPrune(t *testing.T) { dir := t.TempDir() backend := NewMockBackend() From c2007567f786975de9a0029fd7f5992bcaa7a59e Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 23:27:22 -1000 Subject: [PATCH 15/61] treedb: force retained prune when vlog hard cap is exceeded --- TreeDB/caching/db.go | 3 +++ TreeDB/caching/db_test.go | 36 +++++++++++++++++++++++++++++++ TreeDB/env_vlog_overrides_test.go | 13 +++++++++++ TreeDB/public.go | 8 +++++++ 4 files changed, 60 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index ae87b8566..43293f422 100644 --- 
a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4175,6 +4175,9 @@ func (db *DB) allowValueLogPointers() bool { if bytes >= limit { if db.valueLogHardCapWarned.CompareAndSwap(false, true) { db.reportError(fmt.Errorf("cachingdb: retained value-log bytes %d exceed hard cap %d; disabling new value-log pointers", bytes, limit)) + // Hard-cap entry means retained bytes are now constraining placement. + // Request an eager retained prune so lifecycle pins can drain promptly. + db.scheduleRetainedValueLogPruneForce() } return false } diff --git a/TreeDB/caching/db_test.go b/TreeDB/caching/db_test.go index 3c8d029f5..3f8c3151f 100644 --- a/TreeDB/caching/db_test.go +++ b/TreeDB/caching/db_test.go @@ -2358,6 +2358,42 @@ func TestRetainedValueLogPruneForce_RetriesAfterForegroundWritesResume(t *testin } } +func TestAllowValueLogPointers_HardCapRequestsForcedRetainedPrune(t *testing.T) { + cache := &DB{} + cache.testSkipRetainedPrune = true + cache.maxValueLogRetainedBytesHard = 1024 + cache.valueLogRetainedClosedBytes.Store(2048) + + if cache.allowValueLogPointers() { + t.Fatalf("allowValueLogPointers=true, want false when hard cap exceeded") + } + if got := cache.retainedValueLogPruneScheduleForcedRequests.Load(); got != 1 { + t.Fatalf("schedule_forced_requests=%d want 1 after first hard-cap crossing", got) + } + + // Re-check while still over cap should not repeatedly re-schedule until + // retained bytes drop back below the hard cap. 
+ if cache.allowValueLogPointers() { + t.Fatalf("allowValueLogPointers=true on repeated over-cap check, want false") + } + if got := cache.retainedValueLogPruneScheduleForcedRequests.Load(); got != 1 { + t.Fatalf("schedule_forced_requests=%d want 1 after repeated over-cap check", got) + } + + cache.valueLogRetainedClosedBytes.Store(0) + if !cache.allowValueLogPointers() { + t.Fatalf("allowValueLogPointers=false, want true after dropping below hard cap") + } + + cache.valueLogRetainedClosedBytes.Store(4096) + if cache.allowValueLogPointers() { + t.Fatalf("allowValueLogPointers=true after second hard-cap crossing, want false") + } + if got := cache.retainedValueLogPruneScheduleForcedRequests.Load(); got != 2 { + t.Fatalf("schedule_forced_requests=%d want 2 after second hard-cap crossing", got) + } +} + func TestCheckpoint_RateLimitsRetainedValueLogPrune(t *testing.T) { dir := t.TempDir() backend := NewMockBackend() diff --git a/TreeDB/env_vlog_overrides_test.go b/TreeDB/env_vlog_overrides_test.go index 019b46ef0..27e9deeee 100644 --- a/TreeDB/env_vlog_overrides_test.go +++ b/TreeDB/env_vlog_overrides_test.go @@ -153,3 +153,16 @@ func TestApplyEnvMaintenanceOverrides_VlogDictClassModeDefaultAlias(t *testing.T t.Fatalf("expected dict class mode single for default alias, got %v", got) } } + +func TestApplyEnvMaintenanceOverrides_VlogRetainedCaps(t *testing.T) { + opts := Options{} + t.Setenv(envVlogMaxRetainedBytes, "123456") + t.Setenv(envVlogMaxRetainedBytesHard, "654321") + applyEnvMaintenanceOverrides(&opts) + if got := opts.ValueLog.MaxRetainedBytes; got != 123456 { + t.Fatalf("expected max retained bytes=123456, got %d", got) + } + if got := opts.ValueLog.MaxRetainedBytesHard; got != 654321 { + t.Fatalf("expected max retained bytes hard=654321, got %d", got) + } +} diff --git a/TreeDB/public.go b/TreeDB/public.go index 619c9051f..66a853e7a 100644 --- a/TreeDB/public.go +++ b/TreeDB/public.go @@ -710,6 +710,8 @@ const ( envVlogDictEntropy = 
"TREEDB_VLOG_DICT_ENTROPY" // bool envVlogDictAdaptiveRatio = "TREEDB_VLOG_DICT_ADAPTIVE_RATIO" // float64 envVlogDictMinPayloadSavings = "TREEDB_VLOG_DICT_MIN_PAYLOAD_SAVINGS_RATIO" // float64 + envVlogMaxRetainedBytes = "TREEDB_VLOG_MAX_RETAINED_BYTES" // int64 + envVlogMaxRetainedBytesHard = "TREEDB_VLOG_MAX_RETAINED_BYTES_HARD" // int64 ) func applyEnvMaintenanceOverrides(opts *Options) { @@ -828,6 +830,12 @@ func applyEnvMaintenanceOverrides(opts *Options) { if v, ok := envFloat64(envVlogDictMinPayloadSavings); ok { opts.ValueLog.DictMinPayloadSavingsRatio = v } + if v, ok := envInt(envVlogMaxRetainedBytes); ok { + opts.ValueLog.MaxRetainedBytes = int64(v) + } + if v, ok := envInt(envVlogMaxRetainedBytesHard); ok { + opts.ValueLog.MaxRetainedBytesHard = int64(v) + } } func computeDurabilityMode(opts Options) string { From 3b30044b1597faac4a064dd02f55def67ede6c48 Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 23:41:33 -1000 Subject: [PATCH 16/61] treedb: add retained-prune reason counters --- TreeDB/caching/db.go | 103 ++++++++++++++++++++++++++++++++++++-- TreeDB/caching/db_test.go | 8 ++- 2 files changed, 105 insertions(+), 6 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 43293f422..6ecb5a7f0 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4196,6 +4196,16 @@ type valueLogSetRefresher interface { type retainedValueLogPruneStats struct { RemovedSegments int RemovedBytes int64 + InUseSkippedSegments int + InUseSkippedBytes int64 + CandidateSegments int + CandidateBytes int64 + LiveSkippedSegments int + LiveSkippedBytes int64 + ParseSkippedSegments int + ParseSkippedBytes int64 + ZombieMarkedSegments int + ZombieMarkedBytes int64 AbortedForegroundWrites bool RetriedWithoutWriteGate bool RetrySucceeded bool @@ -4233,12 +4243,20 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { inUse[path] = struct{}{} } - candidatePaths := make([]string, 0, len(paths)) + type pruneCandidate 
struct { + path string + size int64 + } + candidatePaths := make([]pruneCandidate, 0, len(paths)) for _, path := range paths { + size := db.valueLogClosedSegmentSize(path) if _, ok := inUse[path]; ok { + out.InUseSkippedSegments++ + if size > 0 { + out.InUseSkippedBytes += size + } continue } - size := db.valueLogClosedSegmentSize(path) if db.cleanupMissingRetainedValueLog(path) { if size > 0 { out.RemovedSegments++ @@ -4246,7 +4264,11 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { } continue } - candidatePaths = append(candidatePaths, path) + out.CandidateSegments++ + if size > 0 { + out.CandidateBytes += size + } + candidatePaths = append(candidatePaths, pruneCandidate{path: path, size: size}) } if len(candidatePaths) == 0 { return out @@ -4271,20 +4293,37 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { removed := false marked := false - for _, path := range candidatePaths { - size := db.valueLogClosedSegmentSize(path) + for _, candidate := range candidatePaths { + path := candidate.path + size := candidate.size laneID, seq, valueLog, ok := parseLogSeq(filepath.Base(path)) if !ok || !valueLog { + out.ParseSkippedSegments++ + if size > 0 { + out.ParseSkippedBytes += size + } continue } if laneID < 0 { + out.ParseSkippedSegments++ + if size > 0 { + out.ParseSkippedBytes += size + } continue } id, err := valuelog.EncodeFileID(uint32(laneID), uint32(seq)) if err != nil { + out.ParseSkippedSegments++ + if size > 0 { + out.ParseSkippedBytes += size + } continue } if _, ok := live[id]; ok { + out.LiveSkippedSegments++ + if size > 0 { + out.LiveSkippedBytes += size + } continue } @@ -4311,6 +4350,10 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { db.reportError(fmt.Errorf("cachingdb: failed to mark value-log %d zombie: %w", id, err)) continue } + out.ZombieMarkedSegments++ + if size > 0 { + out.ZombieMarkedBytes += size + } marked = true } else { 
db.dropValueLogSegment(path) @@ -4518,6 +4561,36 @@ func (db *DB) scheduleRetainedValueLogPruneWithForce(force bool) { if pruneStats.RemovedBytes > 0 { db.retainedValueLogPruneRemovedBytes.Add(uint64(pruneStats.RemovedBytes)) } + if pruneStats.InUseSkippedSegments > 0 { + db.retainedValueLogPruneInUseSkippedSegments.Add(uint64(pruneStats.InUseSkippedSegments)) + } + if pruneStats.InUseSkippedBytes > 0 { + db.retainedValueLogPruneInUseSkippedBytes.Add(uint64(pruneStats.InUseSkippedBytes)) + } + if pruneStats.CandidateSegments > 0 { + db.retainedValueLogPruneCandidateSegments.Add(uint64(pruneStats.CandidateSegments)) + } + if pruneStats.CandidateBytes > 0 { + db.retainedValueLogPruneCandidateBytes.Add(uint64(pruneStats.CandidateBytes)) + } + if pruneStats.LiveSkippedSegments > 0 { + db.retainedValueLogPruneLiveSkippedSegments.Add(uint64(pruneStats.LiveSkippedSegments)) + } + if pruneStats.LiveSkippedBytes > 0 { + db.retainedValueLogPruneLiveSkippedBytes.Add(uint64(pruneStats.LiveSkippedBytes)) + } + if pruneStats.ParseSkippedSegments > 0 { + db.retainedValueLogPruneParseSkippedSegments.Add(uint64(pruneStats.ParseSkippedSegments)) + } + if pruneStats.ParseSkippedBytes > 0 { + db.retainedValueLogPruneParseSkippedBytes.Add(uint64(pruneStats.ParseSkippedBytes)) + } + if pruneStats.ZombieMarkedSegments > 0 { + db.retainedValueLogPruneZombieMarkedSegments.Add(uint64(pruneStats.ZombieMarkedSegments)) + } + if pruneStats.ZombieMarkedBytes > 0 { + db.retainedValueLogPruneZombieMarkedBytes.Add(uint64(pruneStats.ZombieMarkedBytes)) + } }() } @@ -5315,6 +5388,16 @@ type DB struct { retainedValueLogPruneForegroundAbortRuns atomic.Uint64 retainedValueLogPruneRemovedSegments atomic.Uint64 retainedValueLogPruneRemovedBytes atomic.Uint64 + retainedValueLogPruneInUseSkippedSegments atomic.Uint64 + retainedValueLogPruneInUseSkippedBytes atomic.Uint64 + retainedValueLogPruneCandidateSegments atomic.Uint64 + retainedValueLogPruneCandidateBytes atomic.Uint64 + 
retainedValueLogPruneLiveSkippedSegments atomic.Uint64 + retainedValueLogPruneLiveSkippedBytes atomic.Uint64 + retainedValueLogPruneParseSkippedSegments atomic.Uint64 + retainedValueLogPruneParseSkippedBytes atomic.Uint64 + retainedValueLogPruneZombieMarkedSegments atomic.Uint64 + retainedValueLogPruneZombieMarkedBytes atomic.Uint64 retainedValueLogPruneScheduleRequests atomic.Uint64 retainedValueLogPruneScheduleForcedRequests atomic.Uint64 retainedValueLogPruneScheduleSkipClosing atomic.Uint64 @@ -19964,6 +20047,16 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_retained_prune.foreground_abort_runs"] = fmt.Sprintf("%d", db.retainedValueLogPruneForegroundAbortRuns.Load()) stats["treedb.cache.vlog_retained_prune.removed_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneRemovedSegments.Load()) stats["treedb.cache.vlog_retained_prune.removed_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneRemovedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.in_use_skipped_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneInUseSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.in_use_skipped_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneInUseSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.candidate_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneCandidateSegments.Load()) + stats["treedb.cache.vlog_retained_prune.candidate_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneCandidateBytes.Load()) + stats["treedb.cache.vlog_retained_prune.live_skipped_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneLiveSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.live_skipped_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneLiveSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.parse_skipped_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneParseSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.parse_skipped_bytes"] = fmt.Sprintf("%d", 
db.retainedValueLogPruneParseSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.zombie_marked_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneZombieMarkedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.zombie_marked_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneZombieMarkedBytes.Load()) stats["treedb.cache.vlog_retained_prune.pressure_bytes"] = fmt.Sprintf("%d", db.retainedPrunePressureBytes()) stats["treedb.cache.vlog_retained_prune.schedule_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleRequests.Load()) stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleForcedRequests.Load()) diff --git a/TreeDB/caching/db_test.go b/TreeDB/caching/db_test.go index 3f8c3151f..9a2a4a024 100644 --- a/TreeDB/caching/db_test.go +++ b/TreeDB/caching/db_test.go @@ -2027,7 +2027,7 @@ func TestPruneRetainedValueLogs_SkipsLiveScanWhenAllRetainedPathsInUse(t *testin } cache.markValueLogRetain(retained) - cache.pruneRetainedValueLogs(false) + pruneStats := cache.pruneRetainedValueLogs(false) backend.mu.RLock() iteratorCalls := backend.iteratorCalls @@ -2038,6 +2038,12 @@ func TestPruneRetainedValueLogs_SkipsLiveScanWhenAllRetainedPathsInUse(t *testin if !cache.valueLogRetained(retained) { t.Fatalf("expected in-use retained path to remain retained") } + if pruneStats.InUseSkippedSegments != 1 { + t.Fatalf("InUseSkippedSegments=%d want 1", pruneStats.InUseSkippedSegments) + } + if pruneStats.CandidateSegments != 0 { + t.Fatalf("CandidateSegments=%d want 0", pruneStats.CandidateSegments) + } } func seedRetainedPrunePressure(cache *DB, retainedPath string, size int64) { From 8c101d6580e1a61c4eac9ca742cb66ca2bdeb0b7 Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 23:53:16 -1000 Subject: [PATCH 17/61] treedb: instrument rewrite-plan empty reasons --- TreeDB/caching/db.go | 30 +++++++++++ .../caching/vlog_generation_scheduler_test.go | 50 +++++++++++++++++++ 2 files 
changed, 80 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 6ecb5a7f0..77eb3f2c3 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5424,11 +5424,16 @@ type DB struct { vlogGenerationRewritePlanCanceled atomic.Uint64 vlogGenerationRewritePlanErrors atomic.Uint64 vlogGenerationRewritePlanEmpty atomic.Uint64 + vlogGenerationRewritePlanEmptyAgeBlocked atomic.Uint64 + vlogGenerationRewritePlanEmptyNoSelection atomic.Uint64 vlogGenerationRewritePlanSelected atomic.Uint64 vlogGenerationRewritePlanSelectedSegments atomic.Uint64 vlogGenerationRewritePlanSelectedBytes atomic.Uint64 vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 + vlogGenerationRewritePlanPenaltyFilterRuns atomic.Uint64 + vlogGenerationRewritePlanPenaltyFilterSegments atomic.Uint64 + vlogGenerationRewritePlanPenaltyFilterToEmpty atomic.Uint64 vlogGenerationRewritePlanCanceledLastNS atomic.Int64 vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool @@ -12880,6 +12885,22 @@ func (db *DB) observeVlogGenerationRewritePlanOutcomeWithDuration(plan backenddb return } db.vlogGenerationRewritePlanEmpty.Add(1) + if plan.AgeBlockedSegments > 0 && plan.AgeBlockedMinRemainingAge > 0 { + db.vlogGenerationRewritePlanEmptyAgeBlocked.Add(1) + } else { + db.vlogGenerationRewritePlanEmptyNoSelection.Add(1) + } +} + +func (db *DB) observeVlogGenerationRewritePlanPenaltyFilter(before, after int) { + if db == nil || before <= 0 || after >= before { + return + } + db.vlogGenerationRewritePlanPenaltyFilterRuns.Add(1) + db.vlogGenerationRewritePlanPenaltyFilterSegments.Add(uint64(before - after)) + if after == 0 { + db.vlogGenerationRewritePlanPenaltyFilterToEmpty.Add(1) + } } func isVlogGenerationPlannerCanceled(err error) bool { @@ -13785,6 +13806,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog } } else if len(plan.SourceFileIDs) > 
0 { db.clearVlogGenerationRewriteAgeBlockedUntil() + beforePenaltyFilter := len(plan.SourceFileIDs) plan, err = db.filterVlogGenerationRewritePlanPenalties(plan, now) if err != nil { db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerError) @@ -13793,6 +13815,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog } return } + db.observeVlogGenerationRewritePlanPenaltyFilter(beforePenaltyFilter, len(plan.SourceFileIDs)) updatePlanTimestamp = true if len(plan.SourceFileIDs) > 0 { if stagePending { @@ -13940,6 +13963,7 @@ planned: } if len(plan.SourceFileIDs) > 0 { db.clearVlogGenerationRewriteAgeBlockedUntil() + beforePenaltyFilter := len(plan.SourceFileIDs) plan, err = db.filterVlogGenerationRewritePlanPenalties(plan, now) if err != nil { db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerError) @@ -13949,6 +13973,7 @@ planned: } return } + db.observeVlogGenerationRewritePlanPenaltyFilter(beforePenaltyFilter, len(plan.SourceFileIDs)) } if len(plan.SourceFileIDs) == 0 { if shouldDeferVlogGenerationRewritePlanForAge(plan, vlogGenerationRewriteMinSegmentAge) { @@ -20162,11 +20187,16 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.plan_canceled_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanCanceledLastNS.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_errors"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanErrors.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_empty"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanEmpty.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_empty.age_blocked"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanEmptyAgeBlocked.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_empty.no_selection"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanEmptyNoSelection.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelected.Load()) 
stats["treedb.cache.vlog_generation.rewrite.plan_selected_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_live"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedLiveBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedStaleBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.runs"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.segments"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterSegments.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.to_empty_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterToEmpty.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteExecSourceSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledLastNS.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 4bb6a8912..0d203c381 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -320,6 +320,41 @@ func TestObserveVlogGenerationRewritePlanOutcome_SelectedTracksSegmentFallbackBy } } +func TestObserveVlogGenerationRewritePlanOutcome_EmptyReasonBuckets(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewritePlanOutcome(backenddb.ValueLogRewritePlan{ + 
AgeBlockedSegments: 2, + AgeBlockedMinRemainingAge: 3 * time.Second, + }, nil) + db.observeVlogGenerationRewritePlanOutcome(backenddb.ValueLogRewritePlan{}, nil) + + if got, want := db.vlogGenerationRewritePlanEmpty.Load(), uint64(2); got != want { + t.Fatalf("plan empty=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanEmptyAgeBlocked.Load(), uint64(1); got != want { + t.Fatalf("plan empty age-blocked=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanEmptyNoSelection.Load(), uint64(1); got != want { + t.Fatalf("plan empty no-selection=%d want=%d", got, want) + } +} + +func TestObserveVlogGenerationRewritePlanPenaltyFilterCounters(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewritePlanPenaltyFilter(5, 2) + db.observeVlogGenerationRewritePlanPenaltyFilter(2, 0) + + if got, want := db.vlogGenerationRewritePlanPenaltyFilterRuns.Load(), uint64(2); got != want { + t.Fatalf("penalty filter runs=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanPenaltyFilterSegments.Load(), uint64(5); got != want { + t.Fatalf("penalty filter segments=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanPenaltyFilterToEmpty.Load(), uint64(1); got != want { + t.Fatalf("penalty filter to-empty=%d want=%d", got, want) + } +} + func TestMaybeRunVlogGenerationMaintenanceWithOptions_TracksWalOnPeriodicSkip(t *testing.T) { db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{}) @@ -4559,6 +4594,12 @@ func TestVlogGenerationRewritePlan_TracksEmptyPlanOutcome(t *testing.T) { if got := stats["treedb.cache.vlog_generation.rewrite.plan_empty"]; got != "1" { t.Fatalf("plan empty=%q want 1", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.plan_empty.age_blocked"]; got != "0" { + t.Fatalf("plan empty age-blocked=%q want 0", got) + } + if got := 
stats["treedb.cache.vlog_generation.rewrite.plan_empty.no_selection"]; got != "1" { + t.Fatalf("plan empty no-selection=%q want 1", got) + } if got := stats["treedb.cache.vlog_generation.rewrite.plan_selected"]; got != "0" { t.Fatalf("plan selected=%q want 0", got) } @@ -4568,6 +4609,15 @@ func TestVlogGenerationRewritePlan_TracksEmptyPlanOutcome(t *testing.T) { if got := stats["treedb.cache.vlog_generation.rewrite.plan_errors"]; got != "0" { t.Fatalf("plan errors=%q want 0", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.runs"]; got != "0" { + t.Fatalf("plan penalty-filter runs=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.segments"]; got != "0" { + t.Fatalf("plan penalty-filter segments=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.to_empty_runs"]; got != "0" { + t.Fatalf("plan penalty-filter to-empty=%q want 0", got) + } } func TestVlogGenerationRewritePlan_RunsOutsideMaintenanceBarrier(t *testing.T) { From 046622297ba4ddc758083065fcaa82598dcfba51 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 00:08:36 -1000 Subject: [PATCH 18/61] treedb: add env overrides for rewrite budget and triggers --- TreeDB/env_vlog_overrides_test.go | 27 +++++++++++++++ TreeDB/public.go | 55 ++++++++++++++++++++++--------- 2 files changed, 66 insertions(+), 16 deletions(-) diff --git a/TreeDB/env_vlog_overrides_test.go b/TreeDB/env_vlog_overrides_test.go index 27e9deeee..eeebdc1a5 100644 --- a/TreeDB/env_vlog_overrides_test.go +++ b/TreeDB/env_vlog_overrides_test.go @@ -166,3 +166,30 @@ func TestApplyEnvMaintenanceOverrides_VlogRetainedCaps(t *testing.T) { t.Fatalf("expected max retained bytes hard=654321, got %d", got) } } + +func TestApplyEnvMaintenanceOverrides_VlogRewriteControls(t *testing.T) { + opts := Options{} + t.Setenv(envVlogRewriteBudgetBytesPerSec, "123456789") + t.Setenv(envVlogRewriteBudgetRecordsPerSec, "4321") + 
t.Setenv(envVlogRewriteTriggerTotalBytes, "987654321") + t.Setenv(envVlogRewriteTriggerStaleRatioPPM, "345678") + t.Setenv(envVlogRewriteTriggerChurnPerSec, "13579") + applyEnvMaintenanceOverrides(&opts) + + gen := opts.ValueLog.Generational + if got := gen.RewriteBudgetBytesPerSec; got != 123456789 { + t.Fatalf("expected rewrite budget bytes/sec=123456789, got %d", got) + } + if got := gen.RewriteBudgetRecordsPerSec; got != 4321 { + t.Fatalf("expected rewrite budget records/sec=4321, got %d", got) + } + if got := gen.RewriteTriggerTotalBytes; got != 987654321 { + t.Fatalf("expected rewrite trigger total bytes=987654321, got %d", got) + } + if got := gen.RewriteTriggerStaleRatioPPM; got != 345678 { + t.Fatalf("expected rewrite trigger stale ratio ppm=345678, got %d", got) + } + if got := gen.RewriteTriggerChurnPerSec; got != 13579 { + t.Fatalf("expected rewrite trigger churn/sec=13579, got %d", got) + } +} diff --git a/TreeDB/public.go b/TreeDB/public.go index 66a853e7a..2af25ada1 100644 --- a/TreeDB/public.go +++ b/TreeDB/public.go @@ -696,22 +696,27 @@ const ( // - Dict training enabled (TrainBytes > 0), and // - Side stores enabled (dictdb), and // - Split value log enabled (value pointers used). 
- envVlogDictEnable = "TREEDB_VLOG_DICT_ENABLE" // bool - envVlogDictTrainBytes = "TREEDB_VLOG_DICT_TRAIN_BYTES" // int - envVlogDictBytes = "TREEDB_VLOG_DICT_BYTES" // int - envVlogDictMinRecords = "TREEDB_VLOG_DICT_MIN_RECORDS" // int - envVlogDictMaxRecordBytes = "TREEDB_VLOG_DICT_MAX_RECORD_BYTES" // int - envVlogDictSampleStride = "TREEDB_VLOG_DICT_SAMPLE_STRIDE" // int - envVlogDictDedupWindow = "TREEDB_VLOG_DICT_DEDUP_WINDOW" // int - envVlogDictTrainLevel = "TREEDB_VLOG_DICT_TRAIN_LEVEL" // int - envVlogDictMaxK = "TREEDB_VLOG_DICT_MAX_K" // int - envVlogDictClassMode = "TREEDB_VLOG_DICT_CLASS_MODE" // single|split_outer_leaf - envVlogDictZstdLevel = "TREEDB_VLOG_DICT_ZSTD_LEVEL" // fastest|default|better|best|int - envVlogDictEntropy = "TREEDB_VLOG_DICT_ENTROPY" // bool - envVlogDictAdaptiveRatio = "TREEDB_VLOG_DICT_ADAPTIVE_RATIO" // float64 - envVlogDictMinPayloadSavings = "TREEDB_VLOG_DICT_MIN_PAYLOAD_SAVINGS_RATIO" // float64 - envVlogMaxRetainedBytes = "TREEDB_VLOG_MAX_RETAINED_BYTES" // int64 - envVlogMaxRetainedBytesHard = "TREEDB_VLOG_MAX_RETAINED_BYTES_HARD" // int64 + envVlogDictEnable = "TREEDB_VLOG_DICT_ENABLE" // bool + envVlogDictTrainBytes = "TREEDB_VLOG_DICT_TRAIN_BYTES" // int + envVlogDictBytes = "TREEDB_VLOG_DICT_BYTES" // int + envVlogDictMinRecords = "TREEDB_VLOG_DICT_MIN_RECORDS" // int + envVlogDictMaxRecordBytes = "TREEDB_VLOG_DICT_MAX_RECORD_BYTES" // int + envVlogDictSampleStride = "TREEDB_VLOG_DICT_SAMPLE_STRIDE" // int + envVlogDictDedupWindow = "TREEDB_VLOG_DICT_DEDUP_WINDOW" // int + envVlogDictTrainLevel = "TREEDB_VLOG_DICT_TRAIN_LEVEL" // int + envVlogDictMaxK = "TREEDB_VLOG_DICT_MAX_K" // int + envVlogDictClassMode = "TREEDB_VLOG_DICT_CLASS_MODE" // single|split_outer_leaf + envVlogDictZstdLevel = "TREEDB_VLOG_DICT_ZSTD_LEVEL" // fastest|default|better|best|int + envVlogDictEntropy = "TREEDB_VLOG_DICT_ENTROPY" // bool + envVlogDictAdaptiveRatio = "TREEDB_VLOG_DICT_ADAPTIVE_RATIO" // float64 + envVlogDictMinPayloadSavings = 
"TREEDB_VLOG_DICT_MIN_PAYLOAD_SAVINGS_RATIO" // float64 + envVlogMaxRetainedBytes = "TREEDB_VLOG_MAX_RETAINED_BYTES" // int64 + envVlogMaxRetainedBytesHard = "TREEDB_VLOG_MAX_RETAINED_BYTES_HARD" // int64 + envVlogRewriteBudgetBytesPerSec = "TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC" // int64 + envVlogRewriteBudgetRecordsPerSec = "TREEDB_VLOG_REWRITE_BUDGET_RECORDS_PER_SEC" // int + envVlogRewriteTriggerTotalBytes = "TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES" // int64 + envVlogRewriteTriggerStaleRatioPPM = "TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM" // uint32 + envVlogRewriteTriggerChurnPerSec = "TREEDB_VLOG_REWRITE_TRIGGER_CHURN_PER_SEC" // int64 ) func applyEnvMaintenanceOverrides(opts *Options) { @@ -836,6 +841,24 @@ func applyEnvMaintenanceOverrides(opts *Options) { if v, ok := envInt(envVlogMaxRetainedBytesHard); ok { opts.ValueLog.MaxRetainedBytesHard = int64(v) } + if v, ok := envInt(envVlogRewriteBudgetBytesPerSec); ok { + opts.ValueLog.Generational.RewriteBudgetBytesPerSec = int64(v) + } + if v, ok := envInt(envVlogRewriteBudgetRecordsPerSec); ok { + opts.ValueLog.Generational.RewriteBudgetRecordsPerSec = v + } + if v, ok := envInt(envVlogRewriteTriggerTotalBytes); ok { + opts.ValueLog.Generational.RewriteTriggerTotalBytes = int64(v) + } + if v, ok := envInt(envVlogRewriteTriggerStaleRatioPPM); ok { + if v < 0 { + v = 0 + } + opts.ValueLog.Generational.RewriteTriggerStaleRatioPPM = uint32(v) + } + if v, ok := envInt(envVlogRewriteTriggerChurnPerSec); ok { + opts.ValueLog.Generational.RewriteTriggerChurnPerSec = int64(v) + } } func computeDurabilityMode(opts Options) string { From 6ee065be5b35c949211ac290616e2d13b2d691bb Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 00:26:11 -1000 Subject: [PATCH 19/61] treedb: split rewrite cancel metrics by fresh vs queued debt --- TreeDB/caching/db.go | 37 +++++++++++++++- .../caching/vlog_generation_scheduler_test.go | 44 +++++++++++++++++++ 2 files changed, 79 insertions(+), 2 deletions(-) diff --git 
a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 77eb3f2c3..ca27de39b 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5442,7 +5442,13 @@ type DB struct { vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 vlogGenerationRewriteCanceledRuns atomic.Uint64 + vlogGenerationRewriteCanceledFreshPlanRuns atomic.Uint64 + vlogGenerationRewriteCanceledQueuedDebtRuns atomic.Uint64 vlogGenerationRewriteCanceledLastNS atomic.Int64 + vlogGenerationRewriteDeadlineRuns atomic.Uint64 + vlogGenerationRewriteDeadlineFreshPlanRuns atomic.Uint64 + vlogGenerationRewriteDeadlineQueuedDebtRuns atomic.Uint64 + vlogGenerationRewriteDeadlineLastNS atomic.Int64 vlogGenerationRewriteQueuePruneRuns atomic.Uint64 vlogGenerationRewriteQueuePruneIDs atomic.Uint64 vlogGenerationGCSegmentsDeleted atomic.Uint64 @@ -12918,14 +12924,32 @@ func (db *DB) vlogGenerationRewritePlanBackoffActive(now time.Time) bool { return now.Sub(time.Unix(0, lastCanceled)) < vlogGenerationRewritePlanCancelBackoff } -func (db *DB) observeVlogGenerationRewriteCanceled() { +func (db *DB) observeVlogGenerationRewriteCanceled(queuedDebt bool) { if db == nil { return } db.vlogGenerationRewriteCanceledRuns.Add(1) + if queuedDebt { + db.vlogGenerationRewriteCanceledQueuedDebtRuns.Add(1) + } else { + db.vlogGenerationRewriteCanceledFreshPlanRuns.Add(1) + } db.vlogGenerationRewriteCanceledLastNS.Store(time.Now().UnixNano()) } +func (db *DB) observeVlogGenerationRewriteDeadline(queuedDebt bool) { + if db == nil { + return + } + db.vlogGenerationRewriteDeadlineRuns.Add(1) + if queuedDebt { + db.vlogGenerationRewriteDeadlineQueuedDebtRuns.Add(1) + } else { + db.vlogGenerationRewriteDeadlineFreshPlanRuns.Add(1) + } + db.vlogGenerationRewriteDeadlineLastNS.Store(time.Now().UnixNano()) +} + func (db *DB) observeVlogGenerationRewriteQueuePrune(dropped int) { if db == nil || dropped <= 0 { return @@ -14227,14 +14251,17 @@ planned: 
db.observeVlogGenerationRewriteExecDuration(rewriteDur) if err != nil { db.debugVlogMaintf("rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), err, float64(rewriteDur.Microseconds())/1000) + queuedDebt := hadRewriteQueue && len(processedRewriteIDs) > 0 if errors.Is(err, context.Canceled) { - db.observeVlogGenerationRewriteCanceled() + db.observeVlogGenerationRewriteCanceled(queuedDebt) if len(processedRewriteIDs) > 0 { // A canceled rewrite that already selected a queued chunk should // immediately queue a checkpoint-kick retry. The retry executes // as resumable debt with bounded non-cancel semantics. db.vlogGenerationCheckpointKickPending.Store(true) } + } else if errors.Is(err, context.DeadlineExceeded) { + db.observeVlogGenerationRewriteDeadline(queuedDebt) } return fmt.Errorf("generational rewrite: %w", err) } @@ -20199,7 +20226,13 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.to_empty_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterToEmpty.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteExecSourceSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.canceled_runs.fresh_plan"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledFreshPlanRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.canceled_runs.queued_debt"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledQueuedDebtRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledLastNS.Load()) + stats["treedb.cache.vlog_generation.rewrite.deadline_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteDeadlineRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.deadline_runs.fresh_plan"] = 
fmt.Sprintf("%d", db.vlogGenerationRewriteDeadlineFreshPlanRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.deadline_runs.queued_debt"] = fmt.Sprintf("%d", db.vlogGenerationRewriteDeadlineQueuedDebtRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.deadline_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteDeadlineLastNS.Load()) stats["treedb.cache.vlog_generation.rewrite.queue_prune_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteQueuePruneRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.queue_prune_ids"] = fmt.Sprintf("%d", db.vlogGenerationRewriteQueuePruneIDs.Load()) stats["treedb.cache.vlog_generation.rewrite.ineffective_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteIneffectiveRuns.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 0d203c381..530f5df96 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -355,6 +355,38 @@ func TestObserveVlogGenerationRewritePlanPenaltyFilterCounters(t *testing.T) { } } +func TestObserveVlogGenerationRewriteCanceledCountersByQueueState(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewriteCanceled(false) + db.observeVlogGenerationRewriteCanceled(true) + + if got, want := db.vlogGenerationRewriteCanceledRuns.Load(), uint64(2); got != want { + t.Fatalf("rewrite canceled total=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewriteCanceledFreshPlanRuns.Load(), uint64(1); got != want { + t.Fatalf("rewrite canceled fresh=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewriteCanceledQueuedDebtRuns.Load(), uint64(1); got != want { + t.Fatalf("rewrite canceled queued=%d want=%d", got, want) + } +} + +func TestObserveVlogGenerationRewriteDeadlineCountersByQueueState(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewriteDeadline(false) + db.observeVlogGenerationRewriteDeadline(true) + + if got, want := 
db.vlogGenerationRewriteDeadlineRuns.Load(), uint64(2); got != want { + t.Fatalf("rewrite deadline total=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewriteDeadlineFreshPlanRuns.Load(), uint64(1); got != want { + t.Fatalf("rewrite deadline fresh=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewriteDeadlineQueuedDebtRuns.Load(), uint64(1); got != want { + t.Fatalf("rewrite deadline queued=%d want=%d", got, want) + } +} + func TestMaybeRunVlogGenerationMaintenanceWithOptions_TracksWalOnPeriodicSkip(t *testing.T) { db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{}) @@ -978,6 +1010,12 @@ func TestVlogGenerationRewrite_QueuedExecIgnoresForegroundCancelUntilBoundedComp if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs"]; got != "0" { t.Fatalf("rewrite canceled runs=%q want 0 for bounded queued rewrite", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs.fresh_plan"]; got != "0" { + t.Fatalf("rewrite canceled fresh runs=%q want 0 for bounded queued rewrite", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs.queued_debt"]; got != "0" { + t.Fatalf("rewrite canceled queued runs=%q want 0 for bounded queued rewrite", got) + } } func TestVlogGenerationRewrite_CanceledFreshPlanQueuesPendingResume(t *testing.T) { @@ -1086,6 +1124,12 @@ func TestVlogGenerationRewrite_CanceledFreshPlanQueuesPendingResume(t *testing.T if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs"]; got != "1" { t.Fatalf("rewrite canceled runs=%q want 1", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs.fresh_plan"]; got != "1" { + t.Fatalf("rewrite canceled fresh runs=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs.queued_debt"]; got != "0" { + t.Fatalf("rewrite canceled queued runs=%q want 0", 
got) + } } func TestVlogGenerationMaintenance_QueuesPendingCheckpointKickOnActiveCollision(t *testing.T) { From fa183f0caa433a87d71148bdf4605e712f1fdbbc Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 00:50:13 -1000 Subject: [PATCH 20/61] treedb: bound fresh-plan rewrite exec to avoid foreground preemption --- TreeDB/caching/db.go | 2 +- .../caching/vlog_generation_scheduler_test.go | 48 ++++++++----------- 2 files changed, 21 insertions(+), 29 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index ca27de39b..882c1f00c 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -14228,7 +14228,7 @@ planned: } var ctx context.Context var cancel context.CancelFunc - if hadRewriteQueue && len(processedRewriteIDs) > 0 { + if len(processedRewriteIDs) > 0 { ctx, cancel = context.WithTimeout(context.Background(), vlogGenerationRewriteBoundedExecTimeout) } else { ctx, cancel = db.foregroundMaintenanceContext(2 * time.Minute) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 530f5df96..ce83ba165 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -1018,7 +1018,7 @@ func TestVlogGenerationRewrite_QueuedExecIgnoresForegroundCancelUntilBoundedComp } } -func TestVlogGenerationRewrite_CanceledFreshPlanQueuesPendingResume(t *testing.T) { +func TestVlogGenerationRewrite_FreshPlanExecIgnoresForegroundCancelUntilBoundedComplete(t *testing.T) { prepareDirectSchedulerTest(t) dir := t.TempDir() @@ -1090,42 +1090,34 @@ func TestVlogGenerationRewrite_CanceledFreshPlanQueuesPendingResume(t *testing.T select { case <-done: - case <-time.After(2 * wait): - t.Fatalf("initial rewrite did not cancel under foreground activity") + t.Fatalf("rewrite completed early under foreground activity; expected bounded fresh-plan rewrite to continue until release (ctx_ttl=%s)", blocking.recordedRewriteTTL()) + case <-time.After(250 * 
time.Millisecond): } - deadline := time.Now().Add(2 * wait) - for blocking.recordedRewriteCalls() < 2 { - if time.Now().After(deadline) { - t.Fatalf("pending checkpoint-kick resume did not run (calls=%d)", blocking.recordedRewriteCalls()) - } - time.Sleep(10 * time.Millisecond) + releaseRewrite() + select { + case <-done: + case <-time.After(2 * wait): + t.Fatalf("rewrite did not finish after release") } if ttl := blocking.recordedRewriteTTL(); ttl < 20*time.Second { - t.Fatalf("resume rewrite context ttl=%s want around %s", ttl, vlogGenerationRewriteBoundedExecTimeout) + t.Fatalf("fresh-plan rewrite context ttl=%s want around %s", ttl, vlogGenerationRewriteBoundedExecTimeout) } - releaseRewrite() - deadline = time.Now().Add(2 * wait) - for { - queue, qerr := db.currentVlogGenerationRewriteQueue() - if qerr != nil { - t.Fatalf("load rewrite queue: %v", qerr) - } - if len(queue) == 0 { - break - } - if time.Now().After(deadline) { - t.Fatalf("rewrite queue not drained after resume release: queue=%v calls=%d", queue, blocking.recordedRewriteCalls()) - } - time.Sleep(10 * time.Millisecond) + queue, qerr := db.currentVlogGenerationRewriteQueue() + if qerr != nil { + t.Fatalf("load rewrite queue: %v", qerr) } + if len(queue) != 0 { + t.Fatalf("rewrite queue not drained after release: queue=%v calls=%d", queue, blocking.recordedRewriteCalls()) + } + stats := db.Stats() - if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs"]; got != "1" { - t.Fatalf("rewrite canceled runs=%q want 1", got) + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs"]; got != "0" { + t.Fatalf("rewrite canceled runs=%q want 0", got) } - if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs.fresh_plan"]; got != "1" { - t.Fatalf("rewrite canceled fresh runs=%q want 1", got) + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs.fresh_plan"]; got != "0" { + t.Fatalf("rewrite canceled fresh runs=%q want 0", got) } if got := 
stats["treedb.cache.vlog_generation.rewrite.canceled_runs.queued_debt"]; got != "0" { t.Fatalf("rewrite canceled queued runs=%q want 0", got) From d7dca39a6b4d857089a6e268ce8d915e05ed98c3 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 01:23:11 -1000 Subject: [PATCH 21/61] caching: add rewrite source-segment outcome observability --- TreeDB/caching/db.go | 64 +++++++++++++++---- .../caching/vlog_generation_scheduler_test.go | 24 +++++++ TreeDB/db/vlog_rewrite.go | 20 ++++++ TreeDB/db/vlog_rewrite_test.go | 9 +++ TreeDB/vlog_rewrite.go | 13 ++-- 5 files changed, 111 insertions(+), 19 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 882c1f00c..1fb10acd8 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5530,20 +5530,26 @@ type DB struct { // Rewrite budget token bucket (bytes) for online maintenance. This lets us // interpret ValueLogRewriteBudgetBytesPerSec as a true per-second bandwidth // budget while still running maintenance at coarse intervals. 
- vlogGenerationRewriteBudgetLastUnixNano atomic.Int64 - vlogGenerationRewriteBudgetTokensBytes atomic.Int64 - vlogGenerationRewriteBudgetConsumed atomic.Uint64 - vlogGenerationRewritePlanTotalNanos atomic.Uint64 - vlogGenerationRewritePlanMaxNanos atomic.Uint64 - vlogGenerationRewriteExecTotalNanos atomic.Uint64 - vlogGenerationRewriteExecMaxNanos atomic.Uint64 - vlogGenerationRewriteExecSourceSegments atomic.Uint64 - vlogGenerationGCExecTotalNanos atomic.Uint64 - vlogGenerationGCExecMaxNanos atomic.Uint64 - vlogGenerationVacuumExecTotalNanos atomic.Uint64 - vlogGenerationVacuumExecMaxNanos atomic.Uint64 - bgErrMu sync.Mutex - bgErr error + vlogGenerationRewriteBudgetLastUnixNano atomic.Int64 + vlogGenerationRewriteBudgetTokensBytes atomic.Int64 + vlogGenerationRewriteBudgetConsumed atomic.Uint64 + vlogGenerationRewritePlanTotalNanos atomic.Uint64 + vlogGenerationRewritePlanMaxNanos atomic.Uint64 + vlogGenerationRewriteExecTotalNanos atomic.Uint64 + vlogGenerationRewriteExecMaxNanos atomic.Uint64 + vlogGenerationRewriteExecSourceSegments atomic.Uint64 + vlogGenerationRewriteSourceSegmentsRequestedTotal atomic.Uint64 + vlogGenerationRewriteSourceSegmentsStillReferencedTotal atomic.Uint64 + vlogGenerationRewriteSourceSegmentsUnreferencedTotal atomic.Uint64 + vlogGenerationRewriteSourceSegmentsRequestedLast atomic.Uint64 + vlogGenerationRewriteSourceSegmentsStillReferencedLast atomic.Uint64 + vlogGenerationRewriteSourceSegmentsUnreferencedLast atomic.Uint64 + vlogGenerationGCExecTotalNanos atomic.Uint64 + vlogGenerationGCExecMaxNanos atomic.Uint64 + vlogGenerationVacuumExecTotalNanos atomic.Uint64 + vlogGenerationVacuumExecMaxNanos atomic.Uint64 + bgErrMu sync.Mutex + bgErr error // Backpressure state queueBacklogBytes atomic.Int64 @@ -14408,6 +14414,30 @@ planned: if sourceSegments := len(rewriteOpts.SourceFileIDs); sourceSegments > 0 { db.vlogGenerationRewriteExecSourceSegments.Add(uint64(sourceSegments)) } + sourceSegmentsRequested := uint64(0) + if 
stats.SourceSegmentsRequested > 0 { + sourceSegmentsRequested = uint64(stats.SourceSegmentsRequested) + } + sourceSegmentsStillReferenced := uint64(0) + if stats.SourceSegmentsStillReferenced > 0 { + sourceSegmentsStillReferenced = uint64(stats.SourceSegmentsStillReferenced) + } + sourceSegmentsUnreferenced := uint64(0) + if stats.SourceSegmentsUnreferenced > 0 { + sourceSegmentsUnreferenced = uint64(stats.SourceSegmentsUnreferenced) + } + db.vlogGenerationRewriteSourceSegmentsRequestedLast.Store(sourceSegmentsRequested) + db.vlogGenerationRewriteSourceSegmentsStillReferencedLast.Store(sourceSegmentsStillReferenced) + db.vlogGenerationRewriteSourceSegmentsUnreferencedLast.Store(sourceSegmentsUnreferenced) + if sourceSegmentsRequested > 0 { + db.vlogGenerationRewriteSourceSegmentsRequestedTotal.Add(sourceSegmentsRequested) + } + if sourceSegmentsStillReferenced > 0 { + db.vlogGenerationRewriteSourceSegmentsStillReferencedTotal.Add(sourceSegmentsStillReferenced) + } + if sourceSegmentsUnreferenced > 0 { + db.vlogGenerationRewriteSourceSegmentsUnreferencedTotal.Add(sourceSegmentsUnreferenced) + } rewriteBytesIn := int64(0) if processedLedgerOK { rewriteBytesIn = processedLedgerLiveBytes @@ -20225,6 +20255,12 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.segments"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.to_empty_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterToEmpty.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteExecSourceSegments.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsRequestedTotal.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"] = 
fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsStillReferencedTotal.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsUnreferencedTotal.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_last"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsRequestedLast.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_last"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsStillReferencedLast.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_last"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsUnreferencedLast.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_runs.fresh_plan"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledFreshPlanRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_runs.queued_debt"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledQueuedDebtRuns.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index ce83ba165..264fb8b92 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -5823,6 +5823,12 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationMaintenanceSkipStageDue.Store(2) db.vlogGenerationRewritePlanSelectedSegments.Store(6) db.vlogGenerationRewriteExecSourceSegments.Store(3) + db.vlogGenerationRewriteSourceSegmentsRequestedTotal.Store(5) + db.vlogGenerationRewriteSourceSegmentsStillReferencedTotal.Store(2) + db.vlogGenerationRewriteSourceSegmentsUnreferencedTotal.Store(3) + db.vlogGenerationRewriteSourceSegmentsRequestedLast.Store(2) + 
db.vlogGenerationRewriteSourceSegmentsStillReferencedLast.Store(1) + db.vlogGenerationRewriteSourceSegmentsUnreferencedLast.Store(1) db.vlogGenerationRewriteProcessedLiveBytes.Store(900) db.vlogGenerationRewriteProcessedStaleBytes.Store(450) db.vlogGenerationRewriteNoReclaimRuns.Store(3) @@ -5987,6 +5993,24 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"]; got != "3" { t.Fatalf("rewrite exec source segments total=%q want 3", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"]; got != "5" { + t.Fatalf("rewrite exec source segments requested total=%q want 5", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"]; got != "2" { + t.Fatalf("rewrite exec source segments still referenced total=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_total"]; got != "3" { + t.Fatalf("rewrite exec source segments unreferenced total=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_last"]; got != "2" { + t.Fatalf("rewrite exec source segments requested last=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_last"]; got != "1" { + t.Fatalf("rewrite exec source segments still referenced last=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_last"]; got != "1" { + t.Fatalf("rewrite exec source segments unreferenced last=%q want 1", got) + } if got := stats["treedb.cache.vlog_generation.rewrite.processed_live_bytes"]; got != "900" { t.Fatalf("rewrite processed live bytes=%q want 900", got) } diff --git a/TreeDB/db/vlog_rewrite.go b/TreeDB/db/vlog_rewrite.go index efb73b610..ab5b5eb87 100644 --- a/TreeDB/db/vlog_rewrite.go +++ 
b/TreeDB/db/vlog_rewrite.go @@ -49,6 +49,15 @@ type ValueLogRewriteStats struct { BytesBefore int64 BytesAfter int64 RecordsCopied int + // SourceSegmentsRequested is the number of source segments selected for this + // rewrite run after applying selection filters. + SourceSegmentsRequested int + // SourceSegmentsStillReferenced is the subset of selected source segments + // that remained referenced after rewrite pointer swaps and cleanup. + SourceSegmentsStillReferenced int + // SourceSegmentsUnreferenced is the subset of selected source segments that + // became unreferenced after rewrite pointer swaps and cleanup. + SourceSegmentsUnreferenced int } // ValueLogRewritePlan summarizes which segments a sparse online rewrite would @@ -1180,6 +1189,7 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl } sourceIDs, _ = selectRewriteSourceSegmentsWithStats(opts, set.Files, active, liveByID) restrictSource = true + stats.SourceSegmentsRequested = len(sourceIDs) } _ = db.valueLogManager.Release(set) if restrictSource && len(sourceIDs) == 0 { @@ -1351,6 +1361,16 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl if err != nil { return stats, err } + if len(sourceIDs) > 0 { + stillReferenced := 0 + for id := range sourceIDs { + if _, ok := referencedAfter[id]; ok { + stillReferenced++ + } + } + stats.SourceSegmentsStillReferenced = stillReferenced + stats.SourceSegmentsUnreferenced = len(sourceIDs) - stillReferenced + } var protectedPaths map[string]struct{} allowActiveSkip := len(opts.ProtectedPaths) > 0 if allowActiveSkip { diff --git a/TreeDB/db/vlog_rewrite_test.go b/TreeDB/db/vlog_rewrite_test.go index 8528853ea..91127e148 100644 --- a/TreeDB/db/vlog_rewrite_test.go +++ b/TreeDB/db/vlog_rewrite_test.go @@ -2867,6 +2867,15 @@ func TestValueLogRewriteOnline_SourceFileIDsWithStaleFilterMatchesPlanSelection( if stats.RecordsCopied != 1 { t.Fatalf("expected one rewritten record from selected explicit source, 
got %d", stats.RecordsCopied) } + if stats.SourceSegmentsRequested != 1 { + t.Fatalf("source segments requested=%d want 1", stats.SourceSegmentsRequested) + } + if stats.SourceSegmentsStillReferenced != 0 { + t.Fatalf("source segments still referenced=%d want 0", stats.SourceSegmentsStillReferenced) + } + if stats.SourceSegmentsUnreferenced != 1 { + t.Fatalf("source segments unreferenced=%d want 1", stats.SourceSegmentsUnreferenced) + } ptrK1, flagsK1 := readProjectedPointerByKey(t, db, []byte("k1")) ptrK2, flagsK2 := readProjectedPointerByKey(t, db, []byte("k2")) diff --git a/TreeDB/vlog_rewrite.go b/TreeDB/vlog_rewrite.go index fd7879b87..5e60b37da 100644 --- a/TreeDB/vlog_rewrite.go +++ b/TreeDB/vlog_rewrite.go @@ -8,11 +8,14 @@ import ( // ValueLogRewriteStats summarizes value-log rewrite compaction results. type ValueLogRewriteStats struct { - SegmentsBefore int - SegmentsAfter int - BytesBefore int64 - BytesAfter int64 - RecordsCopied int + SegmentsBefore int + SegmentsAfter int + BytesBefore int64 + BytesAfter int64 + RecordsCopied int + SourceSegmentsRequested int + SourceSegmentsStillReferenced int + SourceSegmentsUnreferenced int } // ValueLogRewriteOnlineOptions controls online rewrite batching behavior. 
From e6f54aa17d5e6886bb5f26a40aa8d938d4ff17c8 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 01:50:11 -1000 Subject: [PATCH 22/61] caching: probe rewrite source segments through gc protection buckets --- TreeDB/caching/db.go | 414 ++++++++++-------- .../caching/vlog_generation_scheduler_test.go | 88 ++++ TreeDB/db/vlog_gc.go | 134 ++++-- TreeDB/db/vlog_gc_test.go | 69 +++ 4 files changed, 508 insertions(+), 197 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 1fb10acd8..89a9929c7 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5356,177 +5356,199 @@ type DB struct { valueLogMaxSegmentBytes int64 journalCompression bool - disableJournal bool - relaxedSync bool - notifyError func(error) - debugFlushPointers bool - debugFlushTiming bool - debugPtrEligible atomic.Int64 - debugPtrUsed atomic.Int64 - debugPtrNoPtr atomic.Int64 - debugPtrDenied atomic.Int64 - debugPtrDisabled atomic.Int64 - checkpointRuns atomic.Uint64 - checkpointTotalNs atomic.Uint64 - checkpointMaxNs atomic.Uint64 - checkpointNoopSkips atomic.Uint64 - checkpointFlushMuWaitNs atomic.Uint64 - checkpointFlushMuWaitMaxNs atomic.Uint64 - checkpointAutoVacuumRuns atomic.Uint64 - checkpointAutoVacuumLastCheckRun atomic.Uint64 - checkpointAutoVacuumLastPages atomic.Uint64 - checkpointAutoVacuumLastInternalP50 atomic.Uint64 - checkpointAutoVacuumLastInternalAvg atomic.Uint64 - lastForegroundWriteUnixNano atomic.Int64 - lastForegroundReadUnixNano atomic.Int64 - foregroundReadStampCounter atomic.Uint32 - activeForegroundIterators atomic.Int64 - retainedPruneLastStartUnixNano atomic.Int64 - retainedValueLogPruneLastUnixNano atomic.Int64 - retainedValueLogPruneRuns atomic.Uint64 - retainedValueLogPruneForcedRuns atomic.Uint64 - retainedValueLogPruneForegroundAbortRuns atomic.Uint64 - retainedValueLogPruneRemovedSegments atomic.Uint64 - retainedValueLogPruneRemovedBytes atomic.Uint64 - retainedValueLogPruneInUseSkippedSegments atomic.Uint64 - 
retainedValueLogPruneInUseSkippedBytes atomic.Uint64 - retainedValueLogPruneCandidateSegments atomic.Uint64 - retainedValueLogPruneCandidateBytes atomic.Uint64 - retainedValueLogPruneLiveSkippedSegments atomic.Uint64 - retainedValueLogPruneLiveSkippedBytes atomic.Uint64 - retainedValueLogPruneParseSkippedSegments atomic.Uint64 - retainedValueLogPruneParseSkippedBytes atomic.Uint64 - retainedValueLogPruneZombieMarkedSegments atomic.Uint64 - retainedValueLogPruneZombieMarkedBytes atomic.Uint64 - retainedValueLogPruneScheduleRequests atomic.Uint64 - retainedValueLogPruneScheduleForcedRequests atomic.Uint64 - retainedValueLogPruneScheduleSkipClosing atomic.Uint64 - retainedValueLogPruneScheduleSkipInFlight atomic.Uint64 - retainedValueLogPruneScheduleSkipNoClosedBytes atomic.Uint64 - retainedValueLogPruneScheduleSkipBelowPressure atomic.Uint64 - retainedValueLogPruneScheduleSkipMinInterval atomic.Uint64 - retainedValueLogPruneWriteGateRetries atomic.Uint64 - retainedValueLogPruneWriteGateRetrySuccesses atomic.Uint64 - retainedPruneForceRequested atomic.Bool - retainedPruneMu sync.Mutex - retainedPruneDone chan struct{} - vlogGenerationRemapSuccesses atomic.Uint64 - vlogGenerationRemapFailures atomic.Uint64 - vlogGenerationRewriteBytesIn atomic.Uint64 - vlogGenerationRewriteBytesOut atomic.Uint64 - vlogGenerationRewriteReclaimedBytes atomic.Uint64 - vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 - vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 - vlogGenerationRewriteNoReclaimRuns atomic.Uint64 - vlogGenerationRewriteNoReclaimStaleBytes atomic.Uint64 - vlogGenerationRewriteRuns atomic.Uint64 - vlogGenerationRewritePlanRuns atomic.Uint64 - vlogGenerationRewritePlanCanceled atomic.Uint64 - vlogGenerationRewritePlanErrors atomic.Uint64 - vlogGenerationRewritePlanEmpty atomic.Uint64 - vlogGenerationRewritePlanEmptyAgeBlocked atomic.Uint64 - vlogGenerationRewritePlanEmptyNoSelection atomic.Uint64 - vlogGenerationRewritePlanSelected atomic.Uint64 - 
vlogGenerationRewritePlanSelectedSegments atomic.Uint64 - vlogGenerationRewritePlanSelectedBytes atomic.Uint64 - vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 - vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 - vlogGenerationRewritePlanPenaltyFilterRuns atomic.Uint64 - vlogGenerationRewritePlanPenaltyFilterSegments atomic.Uint64 - vlogGenerationRewritePlanPenaltyFilterToEmpty atomic.Uint64 - vlogGenerationRewritePlanCanceledLastNS atomic.Int64 - vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 - vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool - vlogGenerationRewriteIneffectiveLastNS atomic.Int64 - vlogGenerationRewriteIneffectiveRuns atomic.Uint64 - vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 - vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 - vlogGenerationRewriteCanceledRuns atomic.Uint64 - vlogGenerationRewriteCanceledFreshPlanRuns atomic.Uint64 - vlogGenerationRewriteCanceledQueuedDebtRuns atomic.Uint64 - vlogGenerationRewriteCanceledLastNS atomic.Int64 - vlogGenerationRewriteDeadlineRuns atomic.Uint64 - vlogGenerationRewriteDeadlineFreshPlanRuns atomic.Uint64 - vlogGenerationRewriteDeadlineQueuedDebtRuns atomic.Uint64 - vlogGenerationRewriteDeadlineLastNS atomic.Int64 - vlogGenerationRewriteQueuePruneRuns atomic.Uint64 - vlogGenerationRewriteQueuePruneIDs atomic.Uint64 - vlogGenerationGCSegmentsDeleted atomic.Uint64 - vlogGenerationGCBytesDeleted atomic.Uint64 - vlogGenerationGCRuns atomic.Uint64 - vlogGenerationVacuumRuns atomic.Uint64 - vlogGenerationVacuumFailures atomic.Uint64 - vlogGenerationVacuumSkippedDisabled atomic.Uint64 - vlogGenerationVacuumSkippedRewriteBytes atomic.Uint64 - vlogGenerationVacuumSkippedCooldown atomic.Uint64 - vlogGenerationLastVacuumUnixNano atomic.Int64 - vlogGenerationLastRewritePlanUnixNano atomic.Int64 - vlogGenerationLastRewriteUnixNano atomic.Int64 - vlogGenerationLastGCUnixNano atomic.Int64 - vlogGenerationLastCheckpointKickUnixNano atomic.Int64 - 
vlogGenerationLastGCDryRunUnixNano atomic.Int64 - vlogGenerationLastGCDryRunBytesEligible atomic.Int64 - vlogGenerationLastGCDryRunSegsEligible atomic.Int64 - vlogGenerationLastGCBytesReferenced atomic.Int64 - vlogGenerationLastGCSegmentsReferenced atomic.Int64 - vlogGenerationLastGCBytesActive atomic.Int64 - vlogGenerationLastGCSegmentsActive atomic.Int64 - vlogGenerationLastGCBytesProtected atomic.Int64 - vlogGenerationLastGCSegmentsProtected atomic.Int64 - vlogGenerationLastGCBytesProtectedInUse atomic.Int64 - vlogGenerationLastGCSegmentsProtectedInUse atomic.Int64 - vlogGenerationLastGCBytesProtectedRetained atomic.Int64 - vlogGenerationLastGCSegmentsProtectedRetained atomic.Int64 - vlogGenerationLastGCBytesProtectedOverlap atomic.Int64 - vlogGenerationLastGCSegmentsProtectedOverlap atomic.Int64 - vlogGenerationLastGCBytesProtectedOther atomic.Int64 - vlogGenerationLastGCSegmentsProtectedOther atomic.Int64 - vlogGenerationLastGCBytesEligible atomic.Int64 - vlogGenerationLastGCSegmentsEligible atomic.Int64 - vlogGenerationLastGCBytesDeleted atomic.Int64 - vlogGenerationLastGCSegmentsDeleted atomic.Int64 - vlogGenerationLastGCBytesPending atomic.Int64 - vlogGenerationLastGCSegmentsPending atomic.Int64 - vlogGenerationChurnBytes atomic.Uint64 - vlogGenerationSchedulerState atomic.Uint32 - vlogGenerationMaintenanceActive atomic.Bool - vlogGenerationMaintenanceAttempts atomic.Uint64 - vlogGenerationMaintenanceAcquired atomic.Uint64 - vlogGenerationMaintenanceCollisions atomic.Uint64 - vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 - vlogGenerationMaintenanceSkipPhase atomic.Uint64 - vlogGenerationMaintenanceSkipStageGate atomic.Uint64 - vlogGenerationMaintenanceSkipStageNotDue atomic.Uint64 - vlogGenerationMaintenanceSkipStageDue atomic.Uint64 - vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 - vlogGenerationMaintenanceSkipPriority atomic.Uint64 - vlogGenerationMaintenanceSkipQuiet atomic.Uint64 - vlogGenerationMaintenanceSkipPreCheckpoint 
atomic.Uint64 - vlogGenerationMaintenanceSkipCheckpointing atomic.Uint64 - vlogGenerationMaintenancePassNoop atomic.Uint64 - vlogGenerationMaintenancePassWithRewrite atomic.Uint64 - vlogGenerationMaintenancePassWithGC atomic.Uint64 - vlogGenerationMaintenancePassTotalNanos atomic.Uint64 - vlogGenerationMaintenancePassMaxNanos atomic.Uint64 - vlogGenerationLastReason atomic.Uint32 - vlogGenerationCheckpointKickRuns atomic.Uint64 - vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 - vlogGenerationCheckpointKickGCRuns atomic.Uint64 - vlogGenerationCheckpointKickPending atomic.Bool - vlogGenerationDeferredMaintenancePending atomic.Bool - vlogGenerationDeferredMaintenanceRunning atomic.Bool - vlogGenerationRewriteStageWakeObservedNS atomic.Int64 - vlogGenerationRewriteQueueMu sync.Mutex - vlogGenerationCheckpointKickActive atomic.Bool - vlogGenerationRewriteQueue []uint32 - vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment - vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty - vlogGenerationRewriteStagePending bool - vlogGenerationRewriteStageObservedUnixNano int64 - vlogGenerationRewriteQueueLoaded bool - vlogGenerationLastChurnBps atomic.Int64 - vlogGenerationLastChurnSampleBytes atomic.Uint64 - vlogGenerationLastChurnSampleNS atomic.Int64 + disableJournal bool + relaxedSync bool + notifyError func(error) + debugFlushPointers bool + debugFlushTiming bool + debugPtrEligible atomic.Int64 + debugPtrUsed atomic.Int64 + debugPtrNoPtr atomic.Int64 + debugPtrDenied atomic.Int64 + debugPtrDisabled atomic.Int64 + checkpointRuns atomic.Uint64 + checkpointTotalNs atomic.Uint64 + checkpointMaxNs atomic.Uint64 + checkpointNoopSkips atomic.Uint64 + checkpointFlushMuWaitNs atomic.Uint64 + checkpointFlushMuWaitMaxNs atomic.Uint64 + checkpointAutoVacuumRuns atomic.Uint64 + checkpointAutoVacuumLastCheckRun atomic.Uint64 + checkpointAutoVacuumLastPages atomic.Uint64 + checkpointAutoVacuumLastInternalP50 atomic.Uint64 + 
checkpointAutoVacuumLastInternalAvg atomic.Uint64 + lastForegroundWriteUnixNano atomic.Int64 + lastForegroundReadUnixNano atomic.Int64 + foregroundReadStampCounter atomic.Uint32 + activeForegroundIterators atomic.Int64 + retainedPruneLastStartUnixNano atomic.Int64 + retainedValueLogPruneLastUnixNano atomic.Int64 + retainedValueLogPruneRuns atomic.Uint64 + retainedValueLogPruneForcedRuns atomic.Uint64 + retainedValueLogPruneForegroundAbortRuns atomic.Uint64 + retainedValueLogPruneRemovedSegments atomic.Uint64 + retainedValueLogPruneRemovedBytes atomic.Uint64 + retainedValueLogPruneInUseSkippedSegments atomic.Uint64 + retainedValueLogPruneInUseSkippedBytes atomic.Uint64 + retainedValueLogPruneCandidateSegments atomic.Uint64 + retainedValueLogPruneCandidateBytes atomic.Uint64 + retainedValueLogPruneLiveSkippedSegments atomic.Uint64 + retainedValueLogPruneLiveSkippedBytes atomic.Uint64 + retainedValueLogPruneParseSkippedSegments atomic.Uint64 + retainedValueLogPruneParseSkippedBytes atomic.Uint64 + retainedValueLogPruneZombieMarkedSegments atomic.Uint64 + retainedValueLogPruneZombieMarkedBytes atomic.Uint64 + retainedValueLogPruneScheduleRequests atomic.Uint64 + retainedValueLogPruneScheduleForcedRequests atomic.Uint64 + retainedValueLogPruneScheduleSkipClosing atomic.Uint64 + retainedValueLogPruneScheduleSkipInFlight atomic.Uint64 + retainedValueLogPruneScheduleSkipNoClosedBytes atomic.Uint64 + retainedValueLogPruneScheduleSkipBelowPressure atomic.Uint64 + retainedValueLogPruneScheduleSkipMinInterval atomic.Uint64 + retainedValueLogPruneWriteGateRetries atomic.Uint64 + retainedValueLogPruneWriteGateRetrySuccesses atomic.Uint64 + retainedPruneForceRequested atomic.Bool + retainedPruneMu sync.Mutex + retainedPruneDone chan struct{} + vlogGenerationRemapSuccesses atomic.Uint64 + vlogGenerationRemapFailures atomic.Uint64 + vlogGenerationRewriteBytesIn atomic.Uint64 + vlogGenerationRewriteBytesOut atomic.Uint64 + vlogGenerationRewriteReclaimedBytes atomic.Uint64 + 
vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 + vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 + vlogGenerationRewriteNoReclaimRuns atomic.Uint64 + vlogGenerationRewriteNoReclaimStaleBytes atomic.Uint64 + vlogGenerationRewriteRuns atomic.Uint64 + vlogGenerationRewritePlanRuns atomic.Uint64 + vlogGenerationRewritePlanCanceled atomic.Uint64 + vlogGenerationRewritePlanErrors atomic.Uint64 + vlogGenerationRewritePlanEmpty atomic.Uint64 + vlogGenerationRewritePlanEmptyAgeBlocked atomic.Uint64 + vlogGenerationRewritePlanEmptyNoSelection atomic.Uint64 + vlogGenerationRewritePlanSelected atomic.Uint64 + vlogGenerationRewritePlanSelectedSegments atomic.Uint64 + vlogGenerationRewritePlanSelectedBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 + vlogGenerationRewritePlanPenaltyFilterRuns atomic.Uint64 + vlogGenerationRewritePlanPenaltyFilterSegments atomic.Uint64 + vlogGenerationRewritePlanPenaltyFilterToEmpty atomic.Uint64 + vlogGenerationRewritePlanCanceledLastNS atomic.Int64 + vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 + vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool + vlogGenerationRewriteIneffectiveLastNS atomic.Int64 + vlogGenerationRewriteIneffectiveRuns atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 + vlogGenerationRewriteCanceledRuns atomic.Uint64 + vlogGenerationRewriteCanceledFreshPlanRuns atomic.Uint64 + vlogGenerationRewriteCanceledQueuedDebtRuns atomic.Uint64 + vlogGenerationRewriteCanceledLastNS atomic.Int64 + vlogGenerationRewriteDeadlineRuns atomic.Uint64 + vlogGenerationRewriteDeadlineFreshPlanRuns atomic.Uint64 + vlogGenerationRewriteDeadlineQueuedDebtRuns atomic.Uint64 + vlogGenerationRewriteDeadlineLastNS atomic.Int64 + vlogGenerationRewriteQueuePruneRuns atomic.Uint64 + vlogGenerationRewriteQueuePruneIDs atomic.Uint64 + vlogGenerationGCSegmentsDeleted 
atomic.Uint64 + vlogGenerationGCBytesDeleted atomic.Uint64 + vlogGenerationGCRuns atomic.Uint64 + vlogGenerationVacuumRuns atomic.Uint64 + vlogGenerationVacuumFailures atomic.Uint64 + vlogGenerationVacuumSkippedDisabled atomic.Uint64 + vlogGenerationVacuumSkippedRewriteBytes atomic.Uint64 + vlogGenerationVacuumSkippedCooldown atomic.Uint64 + vlogGenerationLastVacuumUnixNano atomic.Int64 + vlogGenerationLastRewritePlanUnixNano atomic.Int64 + vlogGenerationLastRewriteUnixNano atomic.Int64 + vlogGenerationLastGCUnixNano atomic.Int64 + vlogGenerationLastCheckpointKickUnixNano atomic.Int64 + vlogGenerationLastGCDryRunUnixNano atomic.Int64 + vlogGenerationLastGCDryRunBytesEligible atomic.Int64 + vlogGenerationLastGCDryRunSegsEligible atomic.Int64 + vlogGenerationLastGCBytesReferenced atomic.Int64 + vlogGenerationLastGCSegmentsReferenced atomic.Int64 + vlogGenerationLastGCBytesActive atomic.Int64 + vlogGenerationLastGCSegmentsActive atomic.Int64 + vlogGenerationLastGCBytesProtected atomic.Int64 + vlogGenerationLastGCSegmentsProtected atomic.Int64 + vlogGenerationLastGCBytesProtectedInUse atomic.Int64 + vlogGenerationLastGCSegmentsProtectedInUse atomic.Int64 + vlogGenerationLastGCBytesProtectedRetained atomic.Int64 + vlogGenerationLastGCSegmentsProtectedRetained atomic.Int64 + vlogGenerationLastGCBytesProtectedOverlap atomic.Int64 + vlogGenerationLastGCSegmentsProtectedOverlap atomic.Int64 + vlogGenerationLastGCBytesProtectedOther atomic.Int64 + vlogGenerationLastGCSegmentsProtectedOther atomic.Int64 + vlogGenerationLastGCBytesEligible atomic.Int64 + vlogGenerationLastGCSegmentsEligible atomic.Int64 + vlogGenerationLastGCBytesDeleted atomic.Int64 + vlogGenerationLastGCSegmentsDeleted atomic.Int64 + vlogGenerationLastGCBytesPending atomic.Int64 + vlogGenerationLastGCSegmentsPending atomic.Int64 + vlogGenerationLastGCObservedSourceSegments atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsReferenced atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsActive 
atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsProtected atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsProtectedInUse atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsProtectedRetained atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsProtectedOverlap atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsProtectedOther atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsEligible atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsDeleted atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsPending atomic.Int64 + vlogGenerationLastGCObservedSourceBytes atomic.Int64 + vlogGenerationLastGCObservedSourceBytesReferenced atomic.Int64 + vlogGenerationLastGCObservedSourceBytesActive atomic.Int64 + vlogGenerationLastGCObservedSourceBytesProtected atomic.Int64 + vlogGenerationLastGCObservedSourceBytesProtectedInUse atomic.Int64 + vlogGenerationLastGCObservedSourceBytesProtectedRetained atomic.Int64 + vlogGenerationLastGCObservedSourceBytesProtectedOverlap atomic.Int64 + vlogGenerationLastGCObservedSourceBytesProtectedOther atomic.Int64 + vlogGenerationLastGCObservedSourceBytesEligible atomic.Int64 + vlogGenerationLastGCObservedSourceBytesDeleted atomic.Int64 + vlogGenerationLastGCObservedSourceBytesPending atomic.Int64 + vlogGenerationChurnBytes atomic.Uint64 + vlogGenerationSchedulerState atomic.Uint32 + vlogGenerationMaintenanceActive atomic.Bool + vlogGenerationMaintenanceAttempts atomic.Uint64 + vlogGenerationMaintenanceAcquired atomic.Uint64 + vlogGenerationMaintenanceCollisions atomic.Uint64 + vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 + vlogGenerationMaintenanceSkipPhase atomic.Uint64 + vlogGenerationMaintenanceSkipStageGate atomic.Uint64 + vlogGenerationMaintenanceSkipStageNotDue atomic.Uint64 + vlogGenerationMaintenanceSkipStageDue atomic.Uint64 + vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 + vlogGenerationMaintenanceSkipPriority atomic.Uint64 + vlogGenerationMaintenanceSkipQuiet 
atomic.Uint64 + vlogGenerationMaintenanceSkipPreCheckpoint atomic.Uint64 + vlogGenerationMaintenanceSkipCheckpointing atomic.Uint64 + vlogGenerationMaintenancePassNoop atomic.Uint64 + vlogGenerationMaintenancePassWithRewrite atomic.Uint64 + vlogGenerationMaintenancePassWithGC atomic.Uint64 + vlogGenerationMaintenancePassTotalNanos atomic.Uint64 + vlogGenerationMaintenancePassMaxNanos atomic.Uint64 + vlogGenerationLastReason atomic.Uint32 + vlogGenerationCheckpointKickRuns atomic.Uint64 + vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 + vlogGenerationCheckpointKickGCRuns atomic.Uint64 + vlogGenerationCheckpointKickPending atomic.Bool + vlogGenerationDeferredMaintenancePending atomic.Bool + vlogGenerationDeferredMaintenanceRunning atomic.Bool + vlogGenerationRewriteStageWakeObservedNS atomic.Int64 + vlogGenerationRewriteQueueMu sync.Mutex + vlogGenerationCheckpointKickActive atomic.Bool + vlogGenerationRewriteQueue []uint32 + vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment + vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty + vlogGenerationRewriteStagePending bool + vlogGenerationRewriteStageObservedUnixNano int64 + vlogGenerationRewriteQueueLoaded bool + vlogGenerationLastChurnBps atomic.Int64 + vlogGenerationLastChurnSampleBytes atomic.Uint64 + vlogGenerationLastChurnSampleNS atomic.Int64 // Rewrite budget token bucket (bytes) for online maintenance. This lets us // interpret ValueLogRewriteBudgetBytesPerSec as a true per-second bandwidth // budget while still running maintenance at coarse intervals. 
@@ -12801,6 +12823,28 @@ func (db *DB) observeVlogGenerationGCStats(stats backenddb.ValueLogGCStats) { db.vlogGenerationLastGCSegmentsDeleted.Store(int64(stats.SegmentsDeleted)) db.vlogGenerationLastGCBytesPending.Store(stats.BytesPending) db.vlogGenerationLastGCSegmentsPending.Store(int64(stats.SegmentsPending)) + db.vlogGenerationLastGCObservedSourceSegments.Store(int64(stats.ObservedSourceSegments)) + db.vlogGenerationLastGCObservedSourceSegmentsReferenced.Store(int64(stats.ObservedSourceSegmentsReferenced)) + db.vlogGenerationLastGCObservedSourceSegmentsActive.Store(int64(stats.ObservedSourceSegmentsActive)) + db.vlogGenerationLastGCObservedSourceSegmentsProtected.Store(int64(stats.ObservedSourceSegmentsProtected)) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedInUse.Store(int64(stats.ObservedSourceSegmentsProtectedInUse)) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedRetained.Store(int64(stats.ObservedSourceSegmentsProtectedRetained)) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedOverlap.Store(int64(stats.ObservedSourceSegmentsProtectedOverlap)) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedOther.Store(int64(stats.ObservedSourceSegmentsProtectedOther)) + db.vlogGenerationLastGCObservedSourceSegmentsEligible.Store(int64(stats.ObservedSourceSegmentsEligible)) + db.vlogGenerationLastGCObservedSourceSegmentsDeleted.Store(int64(stats.ObservedSourceSegmentsDeleted)) + db.vlogGenerationLastGCObservedSourceSegmentsPending.Store(int64(stats.ObservedSourceSegmentsPending)) + db.vlogGenerationLastGCObservedSourceBytes.Store(stats.ObservedSourceBytes) + db.vlogGenerationLastGCObservedSourceBytesReferenced.Store(stats.ObservedSourceBytesReferenced) + db.vlogGenerationLastGCObservedSourceBytesActive.Store(stats.ObservedSourceBytesActive) + db.vlogGenerationLastGCObservedSourceBytesProtected.Store(stats.ObservedSourceBytesProtected) + db.vlogGenerationLastGCObservedSourceBytesProtectedInUse.Store(stats.ObservedSourceBytesProtectedInUse) 
+ db.vlogGenerationLastGCObservedSourceBytesProtectedRetained.Store(stats.ObservedSourceBytesProtectedRetained) + db.vlogGenerationLastGCObservedSourceBytesProtectedOverlap.Store(stats.ObservedSourceBytesProtectedOverlap) + db.vlogGenerationLastGCObservedSourceBytesProtectedOther.Store(stats.ObservedSourceBytesProtectedOther) + db.vlogGenerationLastGCObservedSourceBytesEligible.Store(stats.ObservedSourceBytesEligible) + db.vlogGenerationLastGCObservedSourceBytesDeleted.Store(stats.ObservedSourceBytesDeleted) + db.vlogGenerationLastGCObservedSourceBytesPending.Store(stats.ObservedSourceBytesPending) } func (db *DB) observeVlogGenerationVacuumExecDuration(d time.Duration) { @@ -14292,7 +14336,11 @@ planned: if gcer, ok := db.backend.(backendValueLogGCer); ok { gcCtx, gcCancel := context.WithTimeout(context.Background(), 30*time.Second) gcStart := time.Now() - gcStats, gcErr := gcer.ValueLogGC(gcCtx, db.valueLogGCOptions(false)) + gcOpts := db.valueLogGCOptions(false) + if len(processedRewriteIDs) > 0 { + gcOpts.ObservedSourceFileIDs = append([]uint32(nil), processedRewriteIDs...) 
+ } + gcStats, gcErr := gcer.ValueLogGC(gcCtx, gcOpts) gcCancel() gcDur := time.Since(gcStart) db.observeVlogGenerationGCExecDuration(gcDur) @@ -20315,6 +20363,28 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.gc.last_deleted_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesDeleted.Load()) stats["treedb.cache.vlog_generation.gc.last_pending_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsPending.Load()) stats["treedb.cache.vlog_generation.gc.last_pending_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesPending.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegments.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_referenced"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsReferenced.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_active"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsActive.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsProtected.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_in_use"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsProtectedInUse.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_retained"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsProtectedRetained.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_overlap"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsProtectedOverlap.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_other"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsProtectedOther.Load()) + 
stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_eligible"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsEligible.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_deleted"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsDeleted.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_pending"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsPending.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytes.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_referenced"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesReferenced.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_active"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesActive.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesProtected.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_in_use"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesProtectedInUse.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_retained"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesProtectedRetained.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_overlap"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesProtectedOverlap.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_other"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesProtectedOther.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_eligible"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesEligible.Load()) + 
stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_deleted"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesDeleted.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_pending"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesPending.Load()) stats["treedb.cache.vlog_generation.gc.runs"] = fmt.Sprintf("%d", db.vlogGenerationGCRuns.Load()) stats["treedb.cache.vlog_generation.gc.exec.total_ms"] = fmt.Sprintf("%.3f", float64(gcExecTotalNS)/float64(time.Millisecond)) stats["treedb.cache.vlog_generation.gc.exec.max_ms"] = fmt.Sprintf("%.3f", float64(gcExecMaxNS)/float64(time.Millisecond)) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 264fb8b92..9f822397c 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -5819,6 +5819,28 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationLastGCBytesDeleted.Store(200) db.vlogGenerationLastGCSegmentsPending.Store(4) db.vlogGenerationLastGCBytesPending.Store(400) + db.vlogGenerationLastGCObservedSourceSegments.Store(2) + db.vlogGenerationLastGCObservedSourceSegmentsReferenced.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsActive.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsProtected.Store(2) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedInUse.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedRetained.Store(2) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedOverlap.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedOther.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsEligible.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsDeleted.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsPending.Store(0) + db.vlogGenerationLastGCObservedSourceBytes.Store(250) + 
db.vlogGenerationLastGCObservedSourceBytesReferenced.Store(0) + db.vlogGenerationLastGCObservedSourceBytesActive.Store(0) + db.vlogGenerationLastGCObservedSourceBytesProtected.Store(250) + db.vlogGenerationLastGCObservedSourceBytesProtectedInUse.Store(0) + db.vlogGenerationLastGCObservedSourceBytesProtectedRetained.Store(250) + db.vlogGenerationLastGCObservedSourceBytesProtectedOverlap.Store(0) + db.vlogGenerationLastGCObservedSourceBytesProtectedOther.Store(0) + db.vlogGenerationLastGCObservedSourceBytesEligible.Store(0) + db.vlogGenerationLastGCObservedSourceBytesDeleted.Store(0) + db.vlogGenerationLastGCObservedSourceBytesPending.Store(0) db.vlogGenerationMaintenanceSkipStageNotDue.Store(5) db.vlogGenerationMaintenanceSkipStageDue.Store(2) db.vlogGenerationRewritePlanSelectedSegments.Store(6) @@ -5936,6 +5958,72 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.gc.last_pending_bytes"]; got != "400" { t.Fatalf("gc last pending bytes=%q want 400", got) } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments"]; got != "2" { + t.Fatalf("gc last observed source segments=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_referenced"]; got != "0" { + t.Fatalf("gc last observed source segments referenced=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_active"]; got != "0" { + t.Fatalf("gc last observed source segments active=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected"]; got != "2" { + t.Fatalf("gc last observed source segments protected=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_in_use"]; got != "0" { + t.Fatalf("gc last observed source segments protected in-use=%q want 0", got) + } + if got := 
stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_retained"]; got != "2" { + t.Fatalf("gc last observed source segments protected retained=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_overlap"]; got != "0" { + t.Fatalf("gc last observed source segments protected overlap=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_other"]; got != "0" { + t.Fatalf("gc last observed source segments protected other=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_eligible"]; got != "0" { + t.Fatalf("gc last observed source segments eligible=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_deleted"]; got != "0" { + t.Fatalf("gc last observed source segments deleted=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_pending"]; got != "0" { + t.Fatalf("gc last observed source segments pending=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes"]; got != "250" { + t.Fatalf("gc last observed source bytes=%q want 250", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_referenced"]; got != "0" { + t.Fatalf("gc last observed source bytes referenced=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_active"]; got != "0" { + t.Fatalf("gc last observed source bytes active=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected"]; got != "250" { + t.Fatalf("gc last observed source bytes protected=%q want 250", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_in_use"]; got != "0" { + t.Fatalf("gc last observed source bytes protected in-use=%q want 0", 
got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_retained"]; got != "250" { + t.Fatalf("gc last observed source bytes protected retained=%q want 250", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_overlap"]; got != "0" { + t.Fatalf("gc last observed source bytes protected overlap=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_other"]; got != "0" { + t.Fatalf("gc last observed source bytes protected other=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_eligible"]; got != "0" { + t.Fatalf("gc last observed source bytes eligible=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_deleted"]; got != "0" { + t.Fatalf("gc last observed source bytes deleted=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_pending"]; got != "0" { + t.Fatalf("gc last observed source bytes pending=%q want 0", got) + } if got := stats["treedb.cache.vlog_generation.vacuum.exec.total_ms"]; got != "44.000" { t.Fatalf("vacuum exec total ms=%q want 44.000", got) } diff --git a/TreeDB/db/vlog_gc.go b/TreeDB/db/vlog_gc.go index cbdc96e49..258c03b5a 100644 --- a/TreeDB/db/vlog_gc.go +++ b/TreeDB/db/vlog_gc.go @@ -27,32 +27,58 @@ type ValueLogGCOptions struct { ProtectedInUsePaths []string // ProtectedRetainedPaths are paths pinned by pointer lifecycle retention. ProtectedRetainedPaths []string + // ObservedSourceFileIDs enables per-classification probe counters for a + // caller-provided subset of segment IDs (for example, rewrite-selected + // source segments). IDs not present in the current set are ignored. + ObservedSourceFileIDs []uint32 } // ValueLogGCStats summarizes value-log GC work. 
type ValueLogGCStats struct { - SegmentsTotal int - SegmentsReferenced int - SegmentsActive int - SegmentsProtected int - SegmentsProtectedInUse int - SegmentsProtectedRetained int - SegmentsProtectedOverlap int - SegmentsProtectedOther int - SegmentsEligible int - SegmentsDeleted int - SegmentsPending int - BytesTotal int64 - BytesReferenced int64 - BytesActive int64 - BytesProtected int64 - BytesProtectedInUse int64 - BytesProtectedRetained int64 - BytesProtectedOverlap int64 - BytesProtectedOther int64 - BytesEligible int64 - BytesDeleted int64 - BytesPending int64 + SegmentsTotal int + SegmentsReferenced int + SegmentsActive int + SegmentsProtected int + SegmentsProtectedInUse int + SegmentsProtectedRetained int + SegmentsProtectedOverlap int + SegmentsProtectedOther int + SegmentsEligible int + SegmentsDeleted int + SegmentsPending int + BytesTotal int64 + BytesReferenced int64 + BytesActive int64 + BytesProtected int64 + BytesProtectedInUse int64 + BytesProtectedRetained int64 + BytesProtectedOverlap int64 + BytesProtectedOther int64 + BytesEligible int64 + BytesDeleted int64 + BytesPending int64 + ObservedSourceSegments int + ObservedSourceSegmentsReferenced int + ObservedSourceSegmentsActive int + ObservedSourceSegmentsProtected int + ObservedSourceSegmentsProtectedInUse int + ObservedSourceSegmentsProtectedRetained int + ObservedSourceSegmentsProtectedOverlap int + ObservedSourceSegmentsProtectedOther int + ObservedSourceSegmentsEligible int + ObservedSourceSegmentsDeleted int + ObservedSourceSegmentsPending int + ObservedSourceBytes int64 + ObservedSourceBytesReferenced int64 + ObservedSourceBytesActive int64 + ObservedSourceBytesProtected int64 + ObservedSourceBytesProtectedInUse int64 + ObservedSourceBytesProtectedRetained int64 + ObservedSourceBytesProtectedOverlap int64 + ObservedSourceBytesProtectedOther int64 + ObservedSourceBytesEligible int64 + ObservedSourceBytesDeleted int64 + ObservedSourceBytesPending int64 } // ValueLogGC deletes 
fully-unreferenced value-log segments. @@ -127,27 +153,49 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG protectedRetainedPaths[path] = struct{}{} } type candidate struct { - path string - size int64 + path string + size int64 + observed bool } candidates := make(map[uint32]candidate) + observedSourceIDs := make(map[uint32]struct{}, len(opts.ObservedSourceFileIDs)) + for _, id := range opts.ObservedSourceFileIDs { + if id == 0 { + continue + } + observedSourceIDs[id] = struct{}{} + } for id, f := range set.Files { if err := ctx.Err(); err != nil { return stats, err } size := fileSize(f) + observed := false + if _, ok := observedSourceIDs[id]; ok { + observed = true + stats.ObservedSourceSegments++ + stats.ObservedSourceBytes += size + } stats.SegmentsTotal++ stats.BytesTotal += size if _, ok := referenced[id]; ok { stats.SegmentsReferenced++ stats.BytesReferenced += size + if observed { + stats.ObservedSourceSegmentsReferenced++ + stats.ObservedSourceBytesReferenced += size + } continue } if _, ok := keptIDs[id]; ok { stats.SegmentsActive++ stats.BytesActive += size + if observed { + stats.ObservedSourceSegmentsActive++ + stats.ObservedSourceBytesActive += size + } continue } _, inUseProtected := protectedInUsePaths[f.Path] @@ -155,16 +203,32 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG if inUseProtected || retainedProtected { stats.SegmentsProtected++ stats.BytesProtected += size + if observed { + stats.ObservedSourceSegmentsProtected++ + stats.ObservedSourceBytesProtected += size + } switch { case inUseProtected && retainedProtected: stats.SegmentsProtectedOverlap++ stats.BytesProtectedOverlap += size + if observed { + stats.ObservedSourceSegmentsProtectedOverlap++ + stats.ObservedSourceBytesProtectedOverlap += size + } case inUseProtected: stats.SegmentsProtectedInUse++ stats.BytesProtectedInUse += size + if observed { + stats.ObservedSourceSegmentsProtectedInUse++ + 
stats.ObservedSourceBytesProtectedInUse += size + } default: stats.SegmentsProtectedRetained++ stats.BytesProtectedRetained += size + if observed { + stats.ObservedSourceSegmentsProtectedRetained++ + stats.ObservedSourceBytesProtectedRetained += size + } } continue } @@ -173,11 +237,21 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG stats.BytesProtected += size stats.SegmentsProtectedOther++ stats.BytesProtectedOther += size + if observed { + stats.ObservedSourceSegmentsProtected++ + stats.ObservedSourceBytesProtected += size + stats.ObservedSourceSegmentsProtectedOther++ + stats.ObservedSourceBytesProtectedOther += size + } continue } stats.SegmentsEligible++ stats.BytesEligible += size + if observed { + stats.ObservedSourceSegmentsEligible++ + stats.ObservedSourceBytesEligible += size + } if opts.DryRun { continue @@ -185,7 +259,7 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG if err := vm.MarkZombie(id); err != nil { return stats, err } - candidates[id] = candidate{path: f.Path, size: size} + candidates[id] = candidate{path: f.Path, size: size, observed: observed} } if opts.DryRun { @@ -212,6 +286,10 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG if os.IsNotExist(err) { stats.SegmentsDeleted++ stats.BytesDeleted += info.size + if info.observed { + stats.ObservedSourceSegmentsDeleted++ + stats.ObservedSourceBytesDeleted += info.size + } } else { return stats, err } @@ -223,6 +301,12 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG if stats.BytesEligible > stats.BytesDeleted { stats.BytesPending = stats.BytesEligible - stats.BytesDeleted } + if stats.ObservedSourceSegmentsEligible > stats.ObservedSourceSegmentsDeleted { + stats.ObservedSourceSegmentsPending = stats.ObservedSourceSegmentsEligible - stats.ObservedSourceSegmentsDeleted + } + if stats.ObservedSourceBytesEligible > stats.ObservedSourceBytesDeleted { + 
stats.ObservedSourceBytesPending = stats.ObservedSourceBytesEligible - stats.ObservedSourceBytesDeleted + } currentSet := vm.CurrentSetNoRefresh() if currentSet != nil { diff --git a/TreeDB/db/vlog_gc_test.go b/TreeDB/db/vlog_gc_test.go index 771f2b116..c6da6710c 100644 --- a/TreeDB/db/vlog_gc_test.go +++ b/TreeDB/db/vlog_gc_test.go @@ -199,11 +199,24 @@ func TestValueLogGC_ProtectedPathBreakdownStats(t *testing.T) { inUseOnlyPath := filepath.Join(dir, "wal", "value-l0-000001.log") retainedOnlyPath := filepath.Join(dir, "wal", "value-l0-000002.log") overlapPath := filepath.Join(dir, "wal", "value-l0-000003.log") + observedInUseID, err := valuelog.EncodeFileID(0, 1) + if err != nil { + t.Fatalf("observed in-use fileid: %v", err) + } + observedRetainedID, err := valuelog.EncodeFileID(0, 2) + if err != nil { + t.Fatalf("observed retained fileid: %v", err) + } + observedOverlapID, err := valuelog.EncodeFileID(0, 3) + if err != nil { + t.Fatalf("observed overlap fileid: %v", err) + } stats, err := db.ValueLogGC(context.Background(), ValueLogGCOptions{ DryRun: true, ProtectedInUsePaths: []string{inUseOnlyPath, overlapPath}, ProtectedRetainedPaths: []string{retainedOnlyPath, overlapPath}, + ObservedSourceFileIDs: []uint32{observedInUseID, observedRetainedID, observedOverlapID}, }) if err != nil { t.Fatalf("ValueLogGC: %v", err) @@ -245,6 +258,62 @@ func TestValueLogGC_ProtectedPathBreakdownStats(t *testing.T) { if stats.BytesProtectedOther != 0 { t.Fatalf("bytes protected other=%d want 0", stats.BytesProtectedOther) } + if stats.ObservedSourceSegments != 3 { + t.Fatalf("observed source segments=%d want 3", stats.ObservedSourceSegments) + } + if stats.ObservedSourceSegmentsReferenced != 0 { + t.Fatalf("observed source segments referenced=%d want 0", stats.ObservedSourceSegmentsReferenced) + } + if stats.ObservedSourceSegmentsActive != 0 { + t.Fatalf("observed source segments active=%d want 0", stats.ObservedSourceSegmentsActive) + } + if 
stats.ObservedSourceSegmentsProtected != 3 { + t.Fatalf("observed source segments protected=%d want 3", stats.ObservedSourceSegmentsProtected) + } + if stats.ObservedSourceSegmentsProtectedInUse != 1 { + t.Fatalf("observed source segments protected in-use=%d want 1", stats.ObservedSourceSegmentsProtectedInUse) + } + if stats.ObservedSourceSegmentsProtectedRetained != 1 { + t.Fatalf("observed source segments protected retained=%d want 1", stats.ObservedSourceSegmentsProtectedRetained) + } + if stats.ObservedSourceSegmentsProtectedOverlap != 1 { + t.Fatalf("observed source segments protected overlap=%d want 1", stats.ObservedSourceSegmentsProtectedOverlap) + } + if stats.ObservedSourceSegmentsProtectedOther != 0 { + t.Fatalf("observed source segments protected other=%d want 0", stats.ObservedSourceSegmentsProtectedOther) + } + if stats.ObservedSourceSegmentsEligible != 0 { + t.Fatalf("observed source segments eligible=%d want 0", stats.ObservedSourceSegmentsEligible) + } + if stats.ObservedSourceSegmentsDeleted != 0 { + t.Fatalf("observed source segments deleted=%d want 0", stats.ObservedSourceSegmentsDeleted) + } + if stats.ObservedSourceSegmentsPending != 0 { + t.Fatalf("observed source segments pending=%d want 0", stats.ObservedSourceSegmentsPending) + } + if stats.ObservedSourceBytes <= 0 { + t.Fatalf("observed source bytes=%d want >0", stats.ObservedSourceBytes) + } + if stats.ObservedSourceBytesProtected <= 0 { + t.Fatalf("observed source bytes protected=%d want >0", stats.ObservedSourceBytesProtected) + } + if stats.ObservedSourceBytesProtectedInUse <= 0 || + stats.ObservedSourceBytesProtectedRetained <= 0 || + stats.ObservedSourceBytesProtectedOverlap <= 0 { + t.Fatalf("expected non-zero observed source protected byte buckets, got %+v", stats) + } + if stats.ObservedSourceBytesProtectedOther != 0 { + t.Fatalf("observed source bytes protected other=%d want 0", stats.ObservedSourceBytesProtectedOther) + } + if stats.ObservedSourceBytesEligible != 0 { + 
t.Fatalf("observed source bytes eligible=%d want 0", stats.ObservedSourceBytesEligible) + } + if stats.ObservedSourceBytesDeleted != 0 { + t.Fatalf("observed source bytes deleted=%d want 0", stats.ObservedSourceBytesDeleted) + } + if stats.ObservedSourceBytesPending != 0 { + t.Fatalf("observed source bytes pending=%d want 0", stats.ObservedSourceBytesPending) + } } func TestValueLogGC_KeepsReferencedPointerSegments_WithOuterLeavesInValueLog(t *testing.T) { From 0813e223edda24ee0215f4b278a09007bf1c0de1 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 03:18:48 -1000 Subject: [PATCH 23/61] caching: trace rewrite-observed retained prune outcomes --- TreeDB/caching/db.go | 451 ++++++++++++++---- .../caching/vlog_generation_scheduler_test.go | 120 ++++- 2 files changed, 468 insertions(+), 103 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 89a9929c7..a6fcc3705 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4194,21 +4194,100 @@ type valueLogSetRefresher interface { } type retainedValueLogPruneStats struct { - RemovedSegments int - RemovedBytes int64 - InUseSkippedSegments int - InUseSkippedBytes int64 - CandidateSegments int - CandidateBytes int64 - LiveSkippedSegments int - LiveSkippedBytes int64 - ParseSkippedSegments int - ParseSkippedBytes int64 - ZombieMarkedSegments int - ZombieMarkedBytes int64 - AbortedForegroundWrites bool - RetriedWithoutWriteGate bool - RetrySucceeded bool + RemovedSegments int + RemovedBytes int64 + InUseSkippedSegments int + InUseSkippedBytes int64 + CandidateSegments int + CandidateBytes int64 + LiveSkippedSegments int + LiveSkippedBytes int64 + ParseSkippedSegments int + ParseSkippedBytes int64 + ZombieMarkedSegments int + ZombieMarkedBytes int64 + ObservedSourceSegments int + ObservedSourceBytes int64 + ObservedSourceCandidateSegments int + ObservedSourceCandidateBytes int64 + ObservedSourceRemovedSegments int + ObservedSourceRemovedBytes int64 + ObservedSourceInUseSkippedSegments 
int + ObservedSourceInUseSkippedBytes int64 + ObservedSourceLiveSkippedSegments int + ObservedSourceLiveSkippedBytes int64 + ObservedSourceParseSkippedSegments int + ObservedSourceParseSkippedBytes int64 + ObservedSourceZombieMarkedSegments int + ObservedSourceZombieMarkedBytes int64 + AbortedForegroundWrites bool + RetriedWithoutWriteGate bool + RetrySucceeded bool +} + +func (db *DB) observeRetainedValueLogPruneStats(pruneStats retainedValueLogPruneStats) { + if db == nil { + return + } + db.retainedValueLogPruneLastObservedSourceSegments.Store(int64(pruneStats.ObservedSourceSegments)) + db.retainedValueLogPruneLastObservedSourceBytes.Store(pruneStats.ObservedSourceBytes) + db.retainedValueLogPruneLastObservedSourceCandidateSegments.Store(int64(pruneStats.ObservedSourceCandidateSegments)) + db.retainedValueLogPruneLastObservedSourceCandidateBytes.Store(pruneStats.ObservedSourceCandidateBytes) + db.retainedValueLogPruneLastObservedSourceRemovedSegments.Store(int64(pruneStats.ObservedSourceRemovedSegments)) + db.retainedValueLogPruneLastObservedSourceRemovedBytes.Store(pruneStats.ObservedSourceRemovedBytes) + db.retainedValueLogPruneLastObservedSourceInUseSkippedSegments.Store(int64(pruneStats.ObservedSourceInUseSkippedSegments)) + db.retainedValueLogPruneLastObservedSourceInUseSkippedBytes.Store(pruneStats.ObservedSourceInUseSkippedBytes) + db.retainedValueLogPruneLastObservedSourceLiveSkippedSegments.Store(int64(pruneStats.ObservedSourceLiveSkippedSegments)) + db.retainedValueLogPruneLastObservedSourceLiveSkippedBytes.Store(pruneStats.ObservedSourceLiveSkippedBytes) + db.retainedValueLogPruneLastObservedSourceParseSkippedSegments.Store(int64(pruneStats.ObservedSourceParseSkippedSegments)) + db.retainedValueLogPruneLastObservedSourceParseSkippedBytes.Store(pruneStats.ObservedSourceParseSkippedBytes) + db.retainedValueLogPruneLastObservedSourceZombieMarkedSegments.Store(int64(pruneStats.ObservedSourceZombieMarkedSegments)) + 
db.retainedValueLogPruneLastObservedSourceZombieMarkedBytes.Store(pruneStats.ObservedSourceZombieMarkedBytes) + if pruneStats.RetriedWithoutWriteGate { + db.retainedValueLogPruneWriteGateRetries.Add(1) + if pruneStats.RetrySucceeded { + db.retainedValueLogPruneWriteGateRetrySuccesses.Add(1) + } + } + if pruneStats.AbortedForegroundWrites { + db.retainedValueLogPruneForegroundAbortRuns.Add(1) + } + if pruneStats.RemovedSegments > 0 { + db.retainedValueLogPruneRemovedSegments.Add(uint64(pruneStats.RemovedSegments)) + } + if pruneStats.RemovedBytes > 0 { + db.retainedValueLogPruneRemovedBytes.Add(uint64(pruneStats.RemovedBytes)) + } + if pruneStats.InUseSkippedSegments > 0 { + db.retainedValueLogPruneInUseSkippedSegments.Add(uint64(pruneStats.InUseSkippedSegments)) + } + if pruneStats.InUseSkippedBytes > 0 { + db.retainedValueLogPruneInUseSkippedBytes.Add(uint64(pruneStats.InUseSkippedBytes)) + } + if pruneStats.CandidateSegments > 0 { + db.retainedValueLogPruneCandidateSegments.Add(uint64(pruneStats.CandidateSegments)) + } + if pruneStats.CandidateBytes > 0 { + db.retainedValueLogPruneCandidateBytes.Add(uint64(pruneStats.CandidateBytes)) + } + if pruneStats.LiveSkippedSegments > 0 { + db.retainedValueLogPruneLiveSkippedSegments.Add(uint64(pruneStats.LiveSkippedSegments)) + } + if pruneStats.LiveSkippedBytes > 0 { + db.retainedValueLogPruneLiveSkippedBytes.Add(uint64(pruneStats.LiveSkippedBytes)) + } + if pruneStats.ParseSkippedSegments > 0 { + db.retainedValueLogPruneParseSkippedSegments.Add(uint64(pruneStats.ParseSkippedSegments)) + } + if pruneStats.ParseSkippedBytes > 0 { + db.retainedValueLogPruneParseSkippedBytes.Add(uint64(pruneStats.ParseSkippedBytes)) + } + if pruneStats.ZombieMarkedSegments > 0 { + db.retainedValueLogPruneZombieMarkedSegments.Add(uint64(pruneStats.ZombieMarkedSegments)) + } + if pruneStats.ZombieMarkedBytes > 0 { + db.retainedValueLogPruneZombieMarkedBytes.Add(uint64(pruneStats.ZombieMarkedBytes)) + } } func (db *DB) 
valueLogClosedSegmentSize(path string) int64 { @@ -4229,6 +4308,10 @@ func (db *DB) valueLogClosedSegmentSize(path string) int64 { } func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { + return db.pruneRetainedValueLogsWithObserved(force, nil) +} + +func (db *DB) pruneRetainedValueLogsWithObserved(force bool, observedSourceIDs map[uint32]struct{}) retainedValueLogPruneStats { var out retainedValueLogPruneStats if !db.valueLogEnabled() { return out @@ -4244,17 +4327,40 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { } type pruneCandidate struct { - path string - size int64 + path string + size int64 + id uint32 + hasID bool + observed bool } candidatePaths := make([]pruneCandidate, 0, len(paths)) for _, path := range paths { size := db.valueLogClosedSegmentSize(path) + candidate := pruneCandidate{path: path, size: size} + if laneID, seq, valueLog, ok := parseLogSeq(filepath.Base(path)); ok && valueLog && laneID >= 0 { + if id, err := valuelog.EncodeFileID(uint32(laneID), uint32(seq)); err == nil { + candidate.id = id + candidate.hasID = true + if _, ok := observedSourceIDs[id]; ok { + candidate.observed = true + out.ObservedSourceSegments++ + if size > 0 { + out.ObservedSourceBytes += size + } + } + } + } if _, ok := inUse[path]; ok { out.InUseSkippedSegments++ if size > 0 { out.InUseSkippedBytes += size } + if candidate.observed { + out.ObservedSourceInUseSkippedSegments++ + if size > 0 { + out.ObservedSourceInUseSkippedBytes += size + } + } continue } if db.cleanupMissingRetainedValueLog(path) { @@ -4262,13 +4368,25 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { out.RemovedSegments++ out.RemovedBytes += size } + if candidate.observed { + out.ObservedSourceRemovedSegments++ + if size > 0 { + out.ObservedSourceRemovedBytes += size + } + } continue } out.CandidateSegments++ if size > 0 { out.CandidateBytes += size } - candidatePaths = append(candidatePaths, 
pruneCandidate{path: path, size: size}) + if candidate.observed { + out.ObservedSourceCandidateSegments++ + if size > 0 { + out.ObservedSourceCandidateBytes += size + } + } + candidatePaths = append(candidatePaths, candidate) } if len(candidatePaths) == 0 { return out @@ -4296,26 +4414,17 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { for _, candidate := range candidatePaths { path := candidate.path size := candidate.size - laneID, seq, valueLog, ok := parseLogSeq(filepath.Base(path)) - if !ok || !valueLog { + id := candidate.id + if !candidate.hasID { out.ParseSkippedSegments++ if size > 0 { out.ParseSkippedBytes += size } - continue - } - if laneID < 0 { - out.ParseSkippedSegments++ - if size > 0 { - out.ParseSkippedBytes += size - } - continue - } - id, err := valuelog.EncodeFileID(uint32(laneID), uint32(seq)) - if err != nil { - out.ParseSkippedSegments++ - if size > 0 { - out.ParseSkippedBytes += size + if candidate.observed { + out.ObservedSourceParseSkippedSegments++ + if size > 0 { + out.ObservedSourceParseSkippedBytes += size + } } continue } @@ -4324,6 +4433,12 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { if size > 0 { out.LiveSkippedBytes += size } + if candidate.observed { + out.ObservedSourceLiveSkippedSegments++ + if size > 0 { + out.ObservedSourceLiveSkippedBytes += size + } + } continue } @@ -4338,6 +4453,12 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { out.RemovedSegments++ out.RemovedBytes += size } + if candidate.observed { + out.ObservedSourceRemovedSegments++ + if size > 0 { + out.ObservedSourceRemovedBytes += size + } + } continue } if db.cleanupMissingRetainedValueLog(path) { @@ -4345,6 +4466,12 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { out.RemovedSegments++ out.RemovedBytes += size } + if candidate.observed { + out.ObservedSourceRemovedSegments++ + if size > 0 { + out.ObservedSourceRemovedBytes += 
size + } + } continue } db.reportError(fmt.Errorf("cachingdb: failed to mark value-log %d zombie: %w", id, err)) @@ -4354,6 +4481,12 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { if size > 0 { out.ZombieMarkedBytes += size } + if candidate.observed { + out.ObservedSourceZombieMarkedSegments++ + if size > 0 { + out.ObservedSourceZombieMarkedBytes += size + } + } marked = true } else { db.dropValueLogSegment(path) @@ -4366,6 +4499,12 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { out.RemovedSegments++ out.RemovedBytes += size } + if candidate.observed { + out.ObservedSourceRemovedSegments++ + if size > 0 { + out.ObservedSourceRemovedBytes += size + } + } } db.forgetValueLogRetain(path) } @@ -4468,6 +4607,38 @@ func (db *DB) waitForRetainedValueLogPruneQuietOrForce(quietWindow time.Duration } } +func (db *DB) queueRetainedPruneObservedSourceIDs(ids []uint32) { + if db == nil || len(ids) == 0 { + return + } + db.retainedPruneObservedMu.Lock() + if db.retainedPruneObservedSourceIDs == nil { + db.retainedPruneObservedSourceIDs = make(map[uint32]struct{}, len(ids)) + } + for _, id := range ids { + if id == 0 { + continue + } + db.retainedPruneObservedSourceIDs[id] = struct{}{} + } + db.retainedPruneObservedMu.Unlock() +} + +func (db *DB) takeRetainedPruneObservedSourceIDs() map[uint32]struct{} { + if db == nil { + return nil + } + db.retainedPruneObservedMu.Lock() + if len(db.retainedPruneObservedSourceIDs) == 0 { + db.retainedPruneObservedMu.Unlock() + return nil + } + out := db.retainedPruneObservedSourceIDs + db.retainedPruneObservedSourceIDs = nil + db.retainedPruneObservedMu.Unlock() + return out +} + func (db *DB) scheduleRetainedValueLogPrune() { db.scheduleRetainedValueLogPruneWithForce(false) } @@ -4545,51 +4716,13 @@ func (db *DB) scheduleRetainedValueLogPruneWithForce(force bool) { db.retainedValueLogPruneForcedRuns.Add(1) } db.retainedValueLogPruneLastUnixNano.Store(now.UnixNano()) - 
pruneStats := db.pruneRetainedValueLogs(effectiveForce) - if pruneStats.RetriedWithoutWriteGate { - db.retainedValueLogPruneWriteGateRetries.Add(1) - if pruneStats.RetrySucceeded { - db.retainedValueLogPruneWriteGateRetrySuccesses.Add(1) - } - } - if pruneStats.AbortedForegroundWrites { - db.retainedValueLogPruneForegroundAbortRuns.Add(1) - } - if pruneStats.RemovedSegments > 0 { - db.retainedValueLogPruneRemovedSegments.Add(uint64(pruneStats.RemovedSegments)) - } - if pruneStats.RemovedBytes > 0 { - db.retainedValueLogPruneRemovedBytes.Add(uint64(pruneStats.RemovedBytes)) - } - if pruneStats.InUseSkippedSegments > 0 { - db.retainedValueLogPruneInUseSkippedSegments.Add(uint64(pruneStats.InUseSkippedSegments)) - } - if pruneStats.InUseSkippedBytes > 0 { - db.retainedValueLogPruneInUseSkippedBytes.Add(uint64(pruneStats.InUseSkippedBytes)) - } - if pruneStats.CandidateSegments > 0 { - db.retainedValueLogPruneCandidateSegments.Add(uint64(pruneStats.CandidateSegments)) - } - if pruneStats.CandidateBytes > 0 { - db.retainedValueLogPruneCandidateBytes.Add(uint64(pruneStats.CandidateBytes)) - } - if pruneStats.LiveSkippedSegments > 0 { - db.retainedValueLogPruneLiveSkippedSegments.Add(uint64(pruneStats.LiveSkippedSegments)) - } - if pruneStats.LiveSkippedBytes > 0 { - db.retainedValueLogPruneLiveSkippedBytes.Add(uint64(pruneStats.LiveSkippedBytes)) - } - if pruneStats.ParseSkippedSegments > 0 { - db.retainedValueLogPruneParseSkippedSegments.Add(uint64(pruneStats.ParseSkippedSegments)) - } - if pruneStats.ParseSkippedBytes > 0 { - db.retainedValueLogPruneParseSkippedBytes.Add(uint64(pruneStats.ParseSkippedBytes)) - } - if pruneStats.ZombieMarkedSegments > 0 { - db.retainedValueLogPruneZombieMarkedSegments.Add(uint64(pruneStats.ZombieMarkedSegments)) - } - if pruneStats.ZombieMarkedBytes > 0 { - db.retainedValueLogPruneZombieMarkedBytes.Add(uint64(pruneStats.ZombieMarkedBytes)) + observedSourceIDs := db.takeRetainedPruneObservedSourceIDs() + pruneStats := 
db.pruneRetainedValueLogsWithObserved(effectiveForce, observedSourceIDs) + db.observeRetainedValueLogPruneStats(pruneStats) + if len(observedSourceIDs) > 0 && (pruneStats.ObservedSourceZombieMarkedSegments > 0 || pruneStats.ObservedSourceRemovedSegments > 0) { + // When a retained prune processes rewrite-observed source segments, + // queue a near-term maintenance pass so GC can re-check reclaim state. + db.vlogGenerationCheckpointKickPending.Store(true) } }() } @@ -5398,6 +5531,20 @@ type DB struct { retainedValueLogPruneParseSkippedBytes atomic.Uint64 retainedValueLogPruneZombieMarkedSegments atomic.Uint64 retainedValueLogPruneZombieMarkedBytes atomic.Uint64 + retainedValueLogPruneLastObservedSourceSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceCandidateSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceCandidateBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceRemovedSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceRemovedBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceInUseSkippedSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceInUseSkippedBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceLiveSkippedSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceLiveSkippedBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceParseSkippedSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceParseSkippedBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceZombieMarkedSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceZombieMarkedBytes atomic.Int64 retainedValueLogPruneScheduleRequests atomic.Uint64 retainedValueLogPruneScheduleForcedRequests atomic.Uint64 retainedValueLogPruneScheduleSkipClosing atomic.Uint64 @@ -5408,6 +5555,8 @@ type DB struct { retainedValueLogPruneWriteGateRetries atomic.Uint64 retainedValueLogPruneWriteGateRetrySuccesses atomic.Uint64 retainedPruneForceRequested 
atomic.Bool + retainedPruneObservedMu sync.Mutex + retainedPruneObservedSourceIDs map[uint32]struct{} retainedPruneMu sync.Mutex retainedPruneDone chan struct{} vlogGenerationRemapSuccesses atomic.Uint64 @@ -14334,27 +14483,121 @@ planned: } } if gcer, ok := db.backend.(backendValueLogGCer); ok { - gcCtx, gcCancel := context.WithTimeout(context.Background(), 30*time.Second) - gcStart := time.Now() gcOpts := db.valueLogGCOptions(false) if len(processedRewriteIDs) > 0 { gcOpts.ObservedSourceFileIDs = append([]uint32(nil), processedRewriteIDs...) } - gcStats, gcErr := gcer.ValueLogGC(gcCtx, gcOpts) - gcCancel() - gcDur := time.Since(gcStart) - db.observeVlogGenerationGCExecDuration(gcDur) + runGC := func(phase string) (backenddb.ValueLogGCStats, error) { + gcCtx, gcCancel := context.WithTimeout(context.Background(), 30*time.Second) + gcStart := time.Now() + gcStats, gcErr := gcer.ValueLogGC(gcCtx, gcOpts) + gcCancel() + gcDur := time.Since(gcStart) + db.observeVlogGenerationGCExecDuration(gcDur) + if gcErr != nil { + db.debugVlogMaintf( + "gc_after_rewrite_err reason=%s phase=%s err=%v dur_ms=%.3f", + vlogGenerationReasonString(reason), + phase, + gcErr, + float64(gcDur.Microseconds())/1000, + ) + return backenddb.ValueLogGCStats{}, gcErr + } + db.observeVlogGenerationGCStats(gcStats) + db.vlogGenerationGCRuns.Add(1) + if gcStats.SegmentsDeleted > 0 { + db.vlogGenerationGCSegmentsDeleted.Add(uint64(gcStats.SegmentsDeleted)) + } + if gcStats.BytesDeleted > 0 { + db.vlogGenerationGCBytesDeleted.Add(uint64(gcStats.BytesDeleted)) + gcBytesDeleted += int64(gcStats.BytesDeleted) + effectiveBytesAfter -= int64(gcStats.BytesDeleted) + if effectiveBytesAfter < 0 { + effectiveBytesAfter = 0 + } + } + db.debugVlogMaintf( + "gc_after_rewrite_done reason=%s phase=%s dur_ms=%.3f", + vlogGenerationReasonString(reason), + phase, + float64(gcDur.Microseconds())/1000, + ) + return gcStats, nil + } + + gcStats, gcErr := runGC("initial") if gcErr != nil { - 
db.debugVlogMaintf("gc_after_rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), gcErr, float64(gcDur.Microseconds())/1000) return fmt.Errorf("generational gc after rewrite: %w", gcErr) } - db.observeVlogGenerationGCStats(gcStats) - db.vlogGenerationGCRuns.Add(1) - if gcStats.SegmentsDeleted > 0 { - db.vlogGenerationGCSegmentsDeleted.Add(uint64(gcStats.SegmentsDeleted)) - } - if gcStats.BytesDeleted > 0 { - db.vlogGenerationGCBytesDeleted.Add(uint64(gcStats.BytesDeleted)) + + rewriteBlockedByRetained := len(processedRewriteIDs) > 0 && + gcStats.ObservedSourceSegments > 0 && + gcStats.ObservedSourceSegmentsReferenced == 0 && + gcStats.ObservedSourceSegmentsEligible == 0 && + gcStats.ObservedSourceSegmentsProtectedRetained > 0 + if rewriteBlockedByRetained { + if db.retainedPruneActive() { + db.queueRetainedPruneObservedSourceIDs(processedRewriteIDs) + // A prune is already in flight. Ensure a follow-up attempt stays queued. + db.scheduleRetainedValueLogPruneForce() + // Request a follow-up maintenance pass so GC can re-evaluate + // rewrite-observed source segments after the in-flight prune completes. 
+ db.vlogGenerationCheckpointKickPending.Store(true) + } else { + observedSourceIDSet := make(map[uint32]struct{}, len(processedRewriteIDs)) + for _, id := range processedRewriteIDs { + if id == 0 { + continue + } + observedSourceIDSet[id] = struct{}{} + } + nowPrune := time.Now() + db.retainedPruneLastStartUnixNano.Store(nowPrune.UnixNano()) + db.retainedValueLogPruneRuns.Add(1) + db.retainedValueLogPruneForcedRuns.Add(1) + db.retainedValueLogPruneLastUnixNano.Store(nowPrune.UnixNano()) + pruneStats := db.pruneRetainedValueLogsWithObserved(true, observedSourceIDSet) + db.observeRetainedValueLogPruneStats(pruneStats) + db.debugVlogMaintf( + "rewrite_retained_prune reason=%s observed_source_retained_segments=%d observed_source_retained_bytes=%d observed_source_seen_segments=%d observed_source_seen_bytes=%d observed_source_candidate_segments=%d observed_source_candidate_bytes=%d observed_source_removed_segments=%d observed_source_removed_bytes=%d observed_source_zombie_marked_segments=%d observed_source_zombie_marked_bytes=%d observed_source_live_skipped_segments=%d observed_source_live_skipped_bytes=%d observed_source_in_use_skipped_segments=%d observed_source_in_use_skipped_bytes=%d observed_source_parse_skipped_segments=%d observed_source_parse_skipped_bytes=%d removed_segments=%d removed_bytes=%d zombie_marked_segments=%d zombie_marked_bytes=%d live_skipped_segments=%d live_skipped_bytes=%d aborted=%t", + vlogGenerationReasonString(reason), + gcStats.ObservedSourceSegmentsProtectedRetained, + gcStats.ObservedSourceBytesProtectedRetained, + pruneStats.ObservedSourceSegments, + pruneStats.ObservedSourceBytes, + pruneStats.ObservedSourceCandidateSegments, + pruneStats.ObservedSourceCandidateBytes, + pruneStats.ObservedSourceRemovedSegments, + pruneStats.ObservedSourceRemovedBytes, + pruneStats.ObservedSourceZombieMarkedSegments, + pruneStats.ObservedSourceZombieMarkedBytes, + pruneStats.ObservedSourceLiveSkippedSegments, + pruneStats.ObservedSourceLiveSkippedBytes, 
+ pruneStats.ObservedSourceInUseSkippedSegments, + pruneStats.ObservedSourceInUseSkippedBytes, + pruneStats.ObservedSourceParseSkippedSegments, + pruneStats.ObservedSourceParseSkippedBytes, + pruneStats.RemovedSegments, + pruneStats.RemovedBytes, + pruneStats.ZombieMarkedSegments, + pruneStats.ZombieMarkedBytes, + pruneStats.LiveSkippedSegments, + pruneStats.LiveSkippedBytes, + pruneStats.AbortedForegroundWrites, + ) + // Refresh protected path sets after inline retained prune so + // the follow-up GC pass evaluates updated retention state. + gcOpts = db.valueLogGCOptions(false) + if len(processedRewriteIDs) > 0 { + gcOpts.ObservedSourceFileIDs = append([]uint32(nil), processedRewriteIDs...) + } + gcStatsAfterPrune, gcErr := runGC("post_retained_prune") + if gcErr != nil { + return fmt.Errorf("generational gc after retained prune: %w", gcErr) + } + gcStats = gcStatsAfterPrune + } } if gcStats.BytesProtectedRetained > 0 && gcStats.BytesEligible == 0 && db.valueLogRetainedClosedBytes.Load() > 0 { // Retained-path protection can starve live reclaim even when rewrite @@ -14362,14 +14605,6 @@ planned: // lifecycle pins can drain without waiting for byte-pressure gates. 
db.scheduleRetainedValueLogPruneForce() } - if gcStats.BytesDeleted > 0 { - gcBytesDeleted = int64(gcStats.BytesDeleted) - effectiveBytesAfter -= gcBytesDeleted - if effectiveBytesAfter < 0 { - effectiveBytesAfter = 0 - } - } - db.debugVlogMaintf("gc_after_rewrite_done reason=%s dur_ms=%.3f", vlogGenerationReasonString(reason), float64(gcDur.Microseconds())/1000) } if effectiveBytesBefore > effectiveBytesAfter { db.vlogGenerationRewriteReclaimedBytes.Add(uint64(effectiveBytesBefore - effectiveBytesAfter)) @@ -20187,6 +20422,20 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_retained_prune.parse_skipped_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneParseSkippedBytes.Load()) stats["treedb.cache.vlog_retained_prune.zombie_marked_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneZombieMarkedSegments.Load()) stats["treedb.cache.vlog_retained_prune.zombie_marked_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneZombieMarkedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_candidate"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceCandidateSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_candidate"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceCandidateBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_removed"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceRemovedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_removed"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceRemovedBytes.Load()) + 
stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_in_use_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceInUseSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_in_use_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceInUseSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_live_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceLiveSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_live_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceLiveSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_parse_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceParseSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_parse_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceParseSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_zombie_marked"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceZombieMarkedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_zombie_marked"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceZombieMarkedBytes.Load()) stats["treedb.cache.vlog_retained_prune.pressure_bytes"] = fmt.Sprintf("%d", db.retainedPrunePressureBytes()) stats["treedb.cache.vlog_retained_prune.schedule_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleRequests.Load()) stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleForcedRequests.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 9f822397c..98d41c068 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ 
b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -736,7 +736,9 @@ type rewriteBudgetRecordingBackend struct { rewriteResponse backenddb.ValueLogRewriteStats rewriteErr error gcCalls int + gcOpts []backenddb.ValueLogGCOptions gcResponse backenddb.ValueLogGCStats + gcResponses []backenddb.ValueLogGCStats gcErr error } @@ -766,7 +768,18 @@ func (b *rewriteBudgetRecordingBackend) ValueLogRewriteOnline(ctx context.Contex func (b *rewriteBudgetRecordingBackend) ValueLogGC(ctx context.Context, opts backenddb.ValueLogGCOptions) (backenddb.ValueLogGCStats, error) { b.mu.Lock() b.gcCalls++ + b.gcOpts = append(b.gcOpts, cloneGCOptsForTest(opts)) stats := b.gcResponse + if len(b.gcResponses) > 0 { + idx := b.gcCalls - 1 + if idx < 0 { + idx = 0 + } + if idx >= len(b.gcResponses) { + idx = len(b.gcResponses) - 1 + } + stats = b.gcResponses[idx] + } err := b.gcErr b.mu.Unlock() return stats, err @@ -785,6 +798,15 @@ func cloneRewriteOptsForTest(opts backenddb.ValueLogRewriteOnlineOptions) backen return cloned } +func cloneGCOptsForTest(opts backenddb.ValueLogGCOptions) backenddb.ValueLogGCOptions { + cloned := opts + cloned.ProtectedPaths = append([]string(nil), opts.ProtectedPaths...) + cloned.ProtectedInUsePaths = append([]string(nil), opts.ProtectedInUsePaths...) + cloned.ProtectedRetainedPaths = append([]string(nil), opts.ProtectedRetainedPaths...) + cloned.ObservedSourceFileIDs = append([]uint32(nil), opts.ObservedSourceFileIDs...) 
+ return cloned +} + func (b *rewriteBudgetRecordingBackend) recordedPlan() (backenddb.ValueLogRewriteOnlineOptions, int) { b.mu.Lock() defer b.mu.Unlock() @@ -794,7 +816,31 @@ func (b *rewriteBudgetRecordingBackend) recordedPlan() (backenddb.ValueLogRewrit func (b *rewriteBudgetRecordingBackend) recordedGC() (backenddb.ValueLogGCStats, int) { b.mu.Lock() defer b.mu.Unlock() - return b.gcResponse, b.gcCalls + stats := b.gcResponse + if len(b.gcResponses) > 0 && b.gcCalls > 0 { + idx := b.gcCalls - 1 + if idx >= len(b.gcResponses) { + idx = len(b.gcResponses) - 1 + } + stats = b.gcResponses[idx] + } + return stats, b.gcCalls +} + +func (b *rewriteBudgetRecordingBackend) recordedGCObservedSourceCalls() int { + b.mu.Lock() + defer b.mu.Unlock() + count := 0 + for _, opts := range b.gcOpts { + if opts.DryRun { + continue + } + if len(opts.ObservedSourceFileIDs) == 0 { + continue + } + count++ + } + return count } func openRewriteQueueTestDB(t *testing.T, dir string, recorder *rewriteBudgetRecordingBackend) (*DB, func()) { @@ -899,7 +945,7 @@ func TestVlogGenerationMaintenance_SerializesConcurrentRuns(t *testing.T) { // While the first pass is still inside rewrite, a concurrent pass should be // skipped by the maintenance-active gate instead of issuing a second rewrite. 
- db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{ + db.maybeRunVlogGenerationMaintenanceWithOptions(false, vlogGenerationMaintenanceOptions{ bypassQuiet: true, skipRetainedPruneWait: true, skipCheckpoint: true, @@ -1018,6 +1064,76 @@ func TestVlogGenerationRewrite_QueuedExecIgnoresForegroundCancelUntilBoundedComp } } +func TestVlogGenerationRewrite_ObservedSourceRetainedBlock_RunsSecondGC(t *testing.T) { + prepareDirectSchedulerTest(t) + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + rewriteResponse: backenddb.ValueLogRewriteStats{ + BytesBefore: 128, + BytesAfter: 128, + RecordsCopied: 1, + SourceSegmentsRequested: 1, + SourceSegmentsStillReferenced: 0, + SourceSegmentsUnreferenced: 1, + }, + gcResponses: []backenddb.ValueLogGCStats{ + { + BytesProtectedRetained: 64, + BytesEligible: 0, + ObservedSourceSegments: 1, + ObservedSourceSegmentsReferenced: 0, + ObservedSourceSegmentsEligible: 0, + ObservedSourceSegmentsProtectedRetained: 1, + ObservedSourceBytesProtectedRetained: 64, + }, + { + BytesProtectedRetained: 0, + BytesEligible: 64, + BytesDeleted: 64, + ObservedSourceSegments: 1, + ObservedSourceSegmentsEligible: 1, + ObservedSourceSegmentsDeleted: 1, + ObservedSourceBytesEligible: 64, + ObservedSourceBytesDeleted: 64, + }, + }, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + defer cleanup() + skipRetainedPrune(db) + + if err := db.setVlogGenerationRewriteQueue([]uint32{11}); err != nil { + t.Fatalf("seed rewrite queue: %v", err) + } + db.vlogGenerationRewriteBudgetTokensBytes.Store(1024) + forceVlogMaintenanceIdle(db) + forceRetainedPruneIdle(db) + + db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: true, + rewriteDebtDrain: true, + }) + + if 
got := recorder.recordedGCObservedSourceCalls(); got != 2 { + t.Fatalf("observed-source gc calls=%d want 2 when observed source is retained-blocked", got) + } + if got := db.vlogGenerationLastGCObservedSourceSegmentsEligible.Load(); got != 1 { + t.Fatalf("last observed source eligible segments=%d want 1 after second gc", got) + } + if got := db.vlogGenerationLastGCObservedSourceBytesDeleted.Load(); got != 64 { + t.Fatalf("last observed source deleted bytes=%d want 64 after second gc", got) + } +} + func TestVlogGenerationRewrite_FreshPlanExecIgnoresForegroundCancelUntilBoundedComplete(t *testing.T) { prepareDirectSchedulerTest(t) From d0898f78f35d88afb5879fdb4178ca1ef37cf2a3 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 03:57:17 -1000 Subject: [PATCH 24/61] caching: replay observed-source gc after retained prune --- TreeDB/caching/db.go | 103 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 100 insertions(+), 3 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index a6fcc3705..d1907120b 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4639,6 +4639,61 @@ func (db *DB) takeRetainedPruneObservedSourceIDs() map[uint32]struct{} { return out } +func (db *DB) queueVlogGenerationObservedSourceGCList(ids []uint32) { + if db == nil || len(ids) == 0 { + return + } + db.vlogGenerationObservedGCMu.Lock() + if db.vlogGenerationObservedGCSourceIDs == nil { + db.vlogGenerationObservedGCSourceIDs = make(map[uint32]struct{}, len(ids)) + } + for _, id := range ids { + if id == 0 { + continue + } + db.vlogGenerationObservedGCSourceIDs[id] = struct{}{} + } + db.vlogGenerationObservedGCMu.Unlock() +} + +func (db *DB) queueVlogGenerationObservedSourceGCIDs(ids map[uint32]struct{}) { + if db == nil || len(ids) == 0 { + return + } + db.vlogGenerationObservedGCMu.Lock() + if db.vlogGenerationObservedGCSourceIDs == nil { + db.vlogGenerationObservedGCSourceIDs = make(map[uint32]struct{}, len(ids)) + } + for id := range ids { + if 
id == 0 { + continue + } + db.vlogGenerationObservedGCSourceIDs[id] = struct{}{} + } + db.vlogGenerationObservedGCMu.Unlock() +} + +func (db *DB) takeVlogGenerationObservedSourceGCList() []uint32 { + if db == nil { + return nil + } + db.vlogGenerationObservedGCMu.Lock() + if len(db.vlogGenerationObservedGCSourceIDs) == 0 { + db.vlogGenerationObservedGCMu.Unlock() + return nil + } + out := make([]uint32, 0, len(db.vlogGenerationObservedGCSourceIDs)) + for id := range db.vlogGenerationObservedGCSourceIDs { + if id == 0 { + continue + } + out = append(out, id) + } + db.vlogGenerationObservedGCSourceIDs = nil + db.vlogGenerationObservedGCMu.Unlock() + return out +} + func (db *DB) scheduleRetainedValueLogPrune() { db.scheduleRetainedValueLogPruneWithForce(false) } @@ -4722,6 +4777,7 @@ func (db *DB) scheduleRetainedValueLogPruneWithForce(force bool) { if len(observedSourceIDs) > 0 && (pruneStats.ObservedSourceZombieMarkedSegments > 0 || pruneStats.ObservedSourceRemovedSegments > 0) { // When a retained prune processes rewrite-observed source segments, // queue a near-term maintenance pass so GC can re-check reclaim state. + db.queueVlogGenerationObservedSourceGCIDs(observedSourceIDs) db.vlogGenerationCheckpointKickPending.Store(true) } }() @@ -5557,6 +5613,8 @@ type DB struct { retainedPruneForceRequested atomic.Bool retainedPruneObservedMu sync.Mutex retainedPruneObservedSourceIDs map[uint32]struct{} + vlogGenerationObservedGCMu sync.Mutex + vlogGenerationObservedGCSourceIDs map[uint32]struct{} retainedPruneMu sync.Mutex retainedPruneDone chan struct{} vlogGenerationRemapSuccesses atomic.Uint64 @@ -14599,10 +14657,23 @@ planned: gcStats = gcStatsAfterPrune } } + if len(processedRewriteIDs) > 0 && + gcStats.ObservedSourceSegments > 0 && + gcStats.ObservedSourceSegmentsProtectedRetained > 0 && + gcStats.ObservedSourceSegmentsEligible == 0 { + // Rewrite-selected source segments remained retained-protected + // after in-pass prune/GC. 
Queue an observed-source replay GC for + // the next maintenance pass. + db.queueVlogGenerationObservedSourceGCList(processedRewriteIDs) + db.vlogGenerationCheckpointKickPending.Store(true) + } if gcStats.BytesProtectedRetained > 0 && gcStats.BytesEligible == 0 && db.valueLogRetainedClosedBytes.Load() > 0 { // Retained-path protection can starve live reclaim even when rewrite // processed stale payload in-pass. Kick an eager retained prune so // lifecycle pins can drain without waiting for byte-pressure gates. + if len(processedRewriteIDs) > 0 { + db.queueRetainedPruneObservedSourceIDs(processedRewriteIDs) + } db.scheduleRetainedValueLogPruneForce() } } @@ -14780,26 +14851,34 @@ planned: return } + observedSourceGCIDs := db.takeVlogGenerationObservedSourceGCList() + forceObservedSourceGC := len(observedSourceGCIDs) > 0 if envBool(envDisableVlogGenerationGC) { + if forceObservedSourceGC { + db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) + } return } // GC is a best-effort background maintenance task. It requires a checkpoint // barrier to be safe, and that barrier can be very expensive during sustained // ingest/restore when the flush queue is non-empty. Avoid introducing long // stalls by only running the GC path when the cached write queue is drained. 
- if queueLen != 0 { + if queueLen != 0 && !forceObservedSourceGC { return } gcer, ok := db.backend.(backendValueLogGCer) if !ok { + if forceObservedSourceGC { + db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) + } return } - needEligibilityEstimate := !runGC && !db.shouldRunVlogGenerationGC(retained, reclaimable, churnBps) + needEligibilityEstimate := !runGC && !forceObservedSourceGC && !db.shouldRunVlogGenerationGC(retained, reclaimable, churnBps) now = time.Now() lastGC := db.vlogGenerationLastGCUnixNano.Load() if lastGC > 0 { lastAt := time.Unix(0, lastGC) - if now.Sub(lastAt) < vlogGenerationGCMinInterval { + if !forceObservedSourceGC && now.Sub(lastAt) < vlogGenerationGCMinInterval { return } } @@ -14822,6 +14901,9 @@ planned: db.vlogGenerationLastGCUnixNano.Store(now.UnixNano()) ctx, cancel := db.foregroundMaintenanceContext(30 * time.Second) gcOpts := db.valueLogGCOptions(false) + if forceObservedSourceGC { + gcOpts.ObservedSourceFileIDs = append([]uint32(nil), observedSourceGCIDs...) + } gcStart := time.Now() gcStats, err := gcer.ValueLogGC(ctx, gcOpts) cancel() @@ -14833,8 +14915,20 @@ planned: if gcStats.BytesProtectedRetained > 0 && gcStats.BytesEligible == 0 && db.valueLogRetainedClosedBytes.Load() > 0 { // When GC classifies all reclaim blockers as retained-path protection, // trigger an eager retained prune pass to release stale lifecycle pins. 
+ if forceObservedSourceGC { + db.queueRetainedPruneObservedSourceIDs(observedSourceGCIDs) + } db.scheduleRetainedValueLogPruneForce() } + if forceObservedSourceGC && + gcStats.ObservedSourceSegments > 0 && + gcStats.ObservedSourceSegmentsProtectedRetained > 0 && + gcStats.ObservedSourceSegmentsEligible == 0 { + db.queueRetainedPruneObservedSourceIDs(observedSourceGCIDs) + db.scheduleRetainedValueLogPruneForce() + db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) + db.vlogGenerationCheckpointKickPending.Store(true) + } db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) db.vlogGenerationGCRuns.Add(1) if gcStats.SegmentsDeleted > 0 { @@ -14846,6 +14940,9 @@ planned: return nil }) if err != nil { + if forceObservedSourceGC { + db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) + } if errors.Is(err, context.Canceled) { db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) return From 60c1639deee7e67b6fe14a6008b505d09bb1d01b Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 04:13:48 -1000 Subject: [PATCH 25/61] caching: instrument observed-source replay gc queue --- TreeDB/caching/db.go | 43 ++++++++++ .../caching/vlog_generation_scheduler_test.go | 80 +++++++++++++++++++ 2 files changed, 123 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index d1907120b..bb407ff5f 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4647,13 +4647,22 @@ func (db *DB) queueVlogGenerationObservedSourceGCList(ids []uint32) { if db.vlogGenerationObservedGCSourceIDs == nil { db.vlogGenerationObservedGCSourceIDs = make(map[uint32]struct{}, len(ids)) } + added := 0 for _, id := range ids { if id == 0 { continue } + if _, exists := db.vlogGenerationObservedGCSourceIDs[id]; exists { + continue + } db.vlogGenerationObservedGCSourceIDs[id] = struct{}{} + added++ } db.vlogGenerationObservedGCMu.Unlock() + if added > 0 { + db.vlogGenerationObservedGCQueuedBatches.Add(1) + 
db.vlogGenerationObservedGCQueuedIDs.Add(uint64(added)) + } } func (db *DB) queueVlogGenerationObservedSourceGCIDs(ids map[uint32]struct{}) { @@ -4664,13 +4673,22 @@ func (db *DB) queueVlogGenerationObservedSourceGCIDs(ids map[uint32]struct{}) { if db.vlogGenerationObservedGCSourceIDs == nil { db.vlogGenerationObservedGCSourceIDs = make(map[uint32]struct{}, len(ids)) } + added := 0 for id := range ids { if id == 0 { continue } + if _, exists := db.vlogGenerationObservedGCSourceIDs[id]; exists { + continue + } db.vlogGenerationObservedGCSourceIDs[id] = struct{}{} + added++ } db.vlogGenerationObservedGCMu.Unlock() + if added > 0 { + db.vlogGenerationObservedGCQueuedBatches.Add(1) + db.vlogGenerationObservedGCQueuedIDs.Add(uint64(added)) + } } func (db *DB) takeVlogGenerationObservedSourceGCList() []uint32 { @@ -4691,6 +4709,10 @@ func (db *DB) takeVlogGenerationObservedSourceGCList() []uint32 { } db.vlogGenerationObservedGCSourceIDs = nil db.vlogGenerationObservedGCMu.Unlock() + if len(out) > 0 { + db.vlogGenerationObservedGCTakenBatches.Add(1) + db.vlogGenerationObservedGCTakenIDs.Add(uint64(len(out))) + } return out } @@ -5615,6 +5637,12 @@ type DB struct { retainedPruneObservedSourceIDs map[uint32]struct{} vlogGenerationObservedGCMu sync.Mutex vlogGenerationObservedGCSourceIDs map[uint32]struct{} + vlogGenerationObservedGCQueuedBatches atomic.Uint64 + vlogGenerationObservedGCQueuedIDs atomic.Uint64 + vlogGenerationObservedGCTakenBatches atomic.Uint64 + vlogGenerationObservedGCTakenIDs atomic.Uint64 + vlogGenerationObservedGCRuns atomic.Uint64 + vlogGenerationObservedGCRetryQueued atomic.Uint64 retainedPruneMu sync.Mutex retainedPruneDone chan struct{} vlogGenerationRemapSuccesses atomic.Uint64 @@ -14855,6 +14883,7 @@ planned: forceObservedSourceGC := len(observedSourceGCIDs) > 0 if envBool(envDisableVlogGenerationGC) { if forceObservedSourceGC { + db.vlogGenerationObservedGCRetryQueued.Add(1) db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) } return 
@@ -14869,6 +14898,7 @@ planned: gcer, ok := db.backend.(backendValueLogGCer) if !ok { if forceObservedSourceGC { + db.vlogGenerationObservedGCRetryQueued.Add(1) db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) } return @@ -14903,6 +14933,7 @@ planned: gcOpts := db.valueLogGCOptions(false) if forceObservedSourceGC { gcOpts.ObservedSourceFileIDs = append([]uint32(nil), observedSourceGCIDs...) + db.vlogGenerationObservedGCRuns.Add(1) } gcStart := time.Now() gcStats, err := gcer.ValueLogGC(ctx, gcOpts) @@ -14926,6 +14957,7 @@ planned: gcStats.ObservedSourceSegmentsEligible == 0 { db.queueRetainedPruneObservedSourceIDs(observedSourceGCIDs) db.scheduleRetainedValueLogPruneForce() + db.vlogGenerationObservedGCRetryQueued.Add(1) db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) db.vlogGenerationCheckpointKickPending.Store(true) } @@ -14941,6 +14973,7 @@ planned: }) if err != nil { if forceObservedSourceGC { + db.vlogGenerationObservedGCRetryQueued.Add(1) db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) } if errors.Is(err, context.Canceled) { @@ -20462,6 +20495,9 @@ func (db *DB) Stats() map[string]string { } } db.vlogGenerationRewriteQueueMu.Unlock() + db.vlogGenerationObservedGCMu.Lock() + observedGCPending := len(db.vlogGenerationObservedGCSourceIDs) + db.vlogGenerationObservedGCMu.Unlock() rewriteAgeBlockedUntilNS := db.vlogGenerationRewriteAgeBlockedUntilNS.Load() rewriteAgeBlockedRemainingMS := int64(0) if rewriteAgeBlockedUntilNS > 0 { @@ -20648,6 +20684,13 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.runs"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.segments"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.to_empty_runs"] = fmt.Sprintf("%d", 
db.vlogGenerationRewritePlanPenaltyFilterToEmpty.Load()) + stats["treedb.cache.vlog_generation.observed_gc.pending_ids"] = fmt.Sprintf("%d", observedGCPending) + stats["treedb.cache.vlog_generation.observed_gc.queued_batches"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCQueuedBatches.Load()) + stats["treedb.cache.vlog_generation.observed_gc.queued_ids"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCQueuedIDs.Load()) + stats["treedb.cache.vlog_generation.observed_gc.taken_batches"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCTakenBatches.Load()) + stats["treedb.cache.vlog_generation.observed_gc.taken_ids"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCTakenIDs.Load()) + stats["treedb.cache.vlog_generation.observed_gc.runs"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRuns.Load()) + stats["treedb.cache.vlog_generation.observed_gc.retry_queued"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRetryQueued.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteExecSourceSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsRequestedTotal.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsStillReferencedTotal.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 98d41c068..c05292311 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -6,6 +6,7 @@ import ( "fmt" "os" "path/filepath" + "sort" "sync" "testing" "time" @@ -1134,6 +1135,52 @@ func TestVlogGenerationRewrite_ObservedSourceRetainedBlock_RunsSecondGC(t *testi } } +func TestVlogGenerationObservedSourceGCQueue_CountersAndDedupe(t *testing.T) { + db := &DB{} + + 
db.queueVlogGenerationObservedSourceGCList([]uint32{7, 9, 7, 0}) + db.queueVlogGenerationObservedSourceGCIDs(map[uint32]struct{}{ + 0: {}, + 9: {}, + 12: {}, + }) + + if got := db.vlogGenerationObservedGCQueuedBatches.Load(); got != 2 { + t.Fatalf("queued batches=%d want 2", got) + } + if got := db.vlogGenerationObservedGCQueuedIDs.Load(); got != 3 { + t.Fatalf("queued ids=%d want 3", got) + } + + ids := db.takeVlogGenerationObservedSourceGCList() + sort.Slice(ids, func(i, j int) bool { return ids[i] < ids[j] }) + want := []uint32{7, 9, 12} + if len(ids) != len(want) { + t.Fatalf("taken ids len=%d want %d (%v)", len(ids), len(want), ids) + } + for i := range ids { + if ids[i] != want[i] { + t.Fatalf("taken ids[%d]=%d want %d (all=%v)", i, ids[i], want[i], ids) + } + } + + if got := db.vlogGenerationObservedGCTakenBatches.Load(); got != 1 { + t.Fatalf("taken batches=%d want 1", got) + } + if got := db.vlogGenerationObservedGCTakenIDs.Load(); got != uint64(len(want)) { + t.Fatalf("taken ids=%d want %d", got, len(want)) + } + + // Empty take should not mutate taken counters. 
+ _ = db.takeVlogGenerationObservedSourceGCList() + if got := db.vlogGenerationObservedGCTakenBatches.Load(); got != 1 { + t.Fatalf("taken batches after empty take=%d want 1", got) + } + if got := db.vlogGenerationObservedGCTakenIDs.Load(); got != uint64(len(want)) { + t.Fatalf("taken ids after empty take=%d want %d", got, len(want)) + } +} + func TestVlogGenerationRewrite_FreshPlanExecIgnoresForegroundCancelUntilBoundedComplete(t *testing.T) { prepareDirectSchedulerTest(t) @@ -5971,6 +6018,12 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationRewriteProcessedStaleBytes.Store(450) db.vlogGenerationRewriteNoReclaimRuns.Store(3) db.vlogGenerationRewriteNoReclaimStaleBytes.Store(320) + db.vlogGenerationObservedGCQueuedBatches.Store(5) + db.vlogGenerationObservedGCQueuedIDs.Store(12) + db.vlogGenerationObservedGCTakenBatches.Store(4) + db.vlogGenerationObservedGCTakenIDs.Store(9) + db.vlogGenerationObservedGCRuns.Store(3) + db.vlogGenerationObservedGCRetryQueued.Store(2) db.vlogGenerationRewriteQueueMu.Lock() db.vlogGenerationRewriteQueueLoaded = true @@ -5985,6 +6038,12 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationRewriteStagePending = true db.vlogGenerationRewriteStageObservedUnixNano = 1234 db.vlogGenerationRewriteQueueMu.Unlock() + db.vlogGenerationObservedGCMu.Lock() + db.vlogGenerationObservedGCSourceIDs = map[uint32]struct{}{ + 101: {}, + 102: {}, + } + db.vlogGenerationObservedGCMu.Unlock() stats := db.Stats() if got := stats["treedb.cache.vlog_generation.maintenance.pass.total_ms"]; got != "40.000" { @@ -6227,4 +6286,25 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.rewrite.no_reclaim_stale_bytes"]; got != "320" { t.Fatalf("rewrite no reclaim stale bytes=%q want 320", got) } + if got := stats["treedb.cache.vlog_generation.observed_gc.pending_ids"]; got != "2" { + t.Fatalf("observed 
gc pending ids=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.queued_batches"]; got != "5" { + t.Fatalf("observed gc queued batches=%q want 5", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.queued_ids"]; got != "12" { + t.Fatalf("observed gc queued ids=%q want 12", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.taken_batches"]; got != "4" { + t.Fatalf("observed gc taken batches=%q want 4", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.taken_ids"]; got != "9" { + t.Fatalf("observed gc taken ids=%q want 9", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.runs"]; got != "3" { + t.Fatalf("observed gc runs=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.retry_queued"]; got != "2" { + t.Fatalf("observed gc retry queued=%q want 2", got) + } } From c07a887e3655033b528f0e195537fec0ddb54ff0 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 04:45:16 -1000 Subject: [PATCH 26/61] caching: keep bypass-quiet gc alive under foreground resume --- TreeDB/caching/db.go | 79 ++++++++++++++++++- .../caching/vlog_generation_scheduler_test.go | 65 +++++++++++++++ 2 files changed, 142 insertions(+), 2 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index bb407ff5f..287f08e17 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -14516,7 +14516,7 @@ planned: if len(processedRewriteIDs) > 0 { ctx, cancel = context.WithTimeout(context.Background(), vlogGenerationRewriteBoundedExecTimeout) } else { - ctx, cancel = db.foregroundMaintenanceContext(2 * time.Minute) + ctx, cancel = db.vlogGenerationMaintenanceContext(2*time.Minute, opts) } db.debugVlogMaintf( "rewrite_exec reason=%s source_ids=%d max_segments=%d budget_tokens=%d max_source_bytes=%d min_stale_ratio=%.6f queue_len=%d ledger_live_bytes=%d", @@ -14882,6 +14882,12 @@ planned: observedSourceGCIDs := 
db.takeVlogGenerationObservedSourceGCList() forceObservedSourceGC := len(observedSourceGCIDs) > 0 if envBool(envDisableVlogGenerationGC) { + db.debugVlogMaintf( + "gc_skip reason=disabled_env run_gc=%t force_observed=%t observed_ids=%d", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + ) if forceObservedSourceGC { db.vlogGenerationObservedGCRetryQueued.Add(1) db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) @@ -14893,10 +14899,22 @@ planned: // ingest/restore when the flush queue is non-empty. Avoid introducing long // stalls by only running the GC path when the cached write queue is drained. if queueLen != 0 && !forceObservedSourceGC { + db.debugVlogMaintf( + "gc_skip reason=queue_not_drained run_gc=%t queue_len=%d force_observed=%t", + runGC, + queueLen, + forceObservedSourceGC, + ) return } gcer, ok := db.backend.(backendValueLogGCer) if !ok { + db.debugVlogMaintf( + "gc_skip reason=backend_no_gcer run_gc=%t force_observed=%t observed_ids=%d", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + ) if forceObservedSourceGC { db.vlogGenerationObservedGCRetryQueued.Add(1) db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) @@ -14909,6 +14927,14 @@ planned: if lastGC > 0 { lastAt := time.Unix(0, lastGC) if !forceObservedSourceGC && now.Sub(lastAt) < vlogGenerationGCMinInterval { + db.debugVlogMaintf( + "gc_skip reason=min_interval run_gc=%t force_observed=%t observed_ids=%d since_ms=%.3f min_ms=%.3f", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + float64(now.Sub(lastAt).Microseconds())/1000, + float64(vlogGenerationGCMinInterval.Microseconds())/1000, + ) return } } @@ -14924,24 +14950,59 @@ planned: return fmt.Errorf("generational gc dry-run: %w", err) } if gcStats.BytesEligible < vlogGenerationGCMinBytes && gcStats.SegmentsEligible == 0 { + db.debugVlogMaintf( + "gc_skip reason=below_eligibility_floor run_gc=%t force_observed=%t eligible_bytes=%d eligible_segments=%d min_bytes=%d", + runGC, + 
forceObservedSourceGC, + gcStats.BytesEligible, + gcStats.SegmentsEligible, + vlogGenerationGCMinBytes, + ) return nil } } now := time.Now() db.vlogGenerationLastGCUnixNano.Store(now.UnixNano()) - ctx, cancel := db.foregroundMaintenanceContext(30 * time.Second) + ctx, cancel := db.vlogGenerationMaintenanceContext(30*time.Second, opts) gcOpts := db.valueLogGCOptions(false) if forceObservedSourceGC { gcOpts.ObservedSourceFileIDs = append([]uint32(nil), observedSourceGCIDs...) db.vlogGenerationObservedGCRuns.Add(1) } + db.debugVlogMaintf( + "gc_run start run_gc=%t force_observed=%t observed_ids=%d need_estimate=%t", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + needEligibilityEstimate, + ) gcStart := time.Now() gcStats, err := gcer.ValueLogGC(ctx, gcOpts) cancel() db.observeVlogGenerationGCExecDuration(time.Since(gcStart)) if err != nil { + db.debugVlogMaintf( + "gc_run err run_gc=%t force_observed=%t observed_ids=%d err=%v", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + err, + ) return fmt.Errorf("generational gc: %w", err) } + db.debugVlogMaintf( + "gc_run done run_gc=%t force_observed=%t observed_ids=%d deleted_segments=%d deleted_bytes=%d protected_retained_bytes=%d observed_segments=%d observed_eligible=%d observed_deleted=%d observed_protected_retained=%d", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + gcStats.SegmentsDeleted, + gcStats.BytesDeleted, + gcStats.BytesProtectedRetained, + gcStats.ObservedSourceSegments, + gcStats.ObservedSourceSegmentsEligible, + gcStats.ObservedSourceSegmentsDeleted, + gcStats.ObservedSourceSegmentsProtectedRetained, + ) db.observeVlogGenerationGCStats(gcStats) if gcStats.BytesProtectedRetained > 0 && gcStats.BytesEligible == 0 && db.valueLogRetainedClosedBytes.Load() > 0 { // When GC classifies all reclaim blockers as retained-path protection, @@ -14955,6 +15016,13 @@ planned: gcStats.ObservedSourceSegments > 0 && gcStats.ObservedSourceSegmentsProtectedRetained > 0 && 
gcStats.ObservedSourceSegmentsEligible == 0 { + db.debugVlogMaintf( + "gc_observed_retry reason=retained_protected observed_ids=%d observed_segments=%d observed_protected_retained=%d observed_eligible=%d", + len(observedSourceGCIDs), + gcStats.ObservedSourceSegments, + gcStats.ObservedSourceSegmentsProtectedRetained, + gcStats.ObservedSourceSegmentsEligible, + ) db.queueRetainedPruneObservedSourceIDs(observedSourceGCIDs) db.scheduleRetainedValueLogPruneForce() db.vlogGenerationObservedGCRetryQueued.Add(1) @@ -14972,6 +15040,13 @@ planned: return nil }) if err != nil { + db.debugVlogMaintf( + "gc_maintenance_err run_gc=%t force_observed=%t observed_ids=%d err=%v", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + err, + ) if forceObservedSourceGC { db.vlogGenerationObservedGCRetryQueued.Add(1) db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index c05292311..9c5cd67ff 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -741,6 +741,7 @@ type rewriteBudgetRecordingBackend struct { gcResponse backenddb.ValueLogGCStats gcResponses []backenddb.ValueLogGCStats gcErr error + gcFn func(context.Context, backenddb.ValueLogGCOptions) (backenddb.ValueLogGCStats, error) } func (b *rewriteBudgetRecordingBackend) ValueLogRewritePlan(ctx context.Context, opts backenddb.ValueLogRewriteOnlineOptions) (backenddb.ValueLogRewritePlan, error) { @@ -770,6 +771,7 @@ func (b *rewriteBudgetRecordingBackend) ValueLogGC(ctx context.Context, opts bac b.mu.Lock() b.gcCalls++ b.gcOpts = append(b.gcOpts, cloneGCOptsForTest(opts)) + customFn := b.gcFn stats := b.gcResponse if len(b.gcResponses) > 0 { idx := b.gcCalls - 1 @@ -783,6 +785,9 @@ func (b *rewriteBudgetRecordingBackend) ValueLogGC(ctx context.Context, opts bac } err := b.gcErr b.mu.Unlock() + if customFn != nil { + return 
customFn(ctx, opts) + } return stats, err } @@ -1181,6 +1186,66 @@ func TestVlogGenerationObservedSourceGCQueue_CountersAndDedupe(t *testing.T) { } } +func TestVlogGenerationMaintenance_ObservedSourceGCBypassQuietIgnoresForegroundResume(t *testing.T) { + prepareDirectSchedulerTest(t) + t.Setenv(envDisableVlogGenerationRewrite, "1") + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + gcFn: func(ctx context.Context, _ backenddb.ValueLogGCOptions) (backenddb.ValueLogGCStats, error) { + select { + case <-time.After(200 * time.Millisecond): + if err := ctx.Err(); err != nil { + return backenddb.ValueLogGCStats{}, err + } + return backenddb.ValueLogGCStats{}, nil + case <-ctx.Done(): + return backenddb.ValueLogGCStats{}, ctx.Err() + } + }, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + defer cleanup() + skipRetainedPrune(db) + + db.queueVlogGenerationObservedSourceGCList([]uint32{11}) + db.vlogGenerationLastGCUnixNano.Store(time.Now().Add(-time.Minute).UnixNano()) + forceVlogMaintenanceIdle(db) + + go func() { + time.Sleep(30 * time.Millisecond) + hot := time.Now().UnixNano() + db.lastForegroundWriteUnixNano.Store(hot) + db.lastForegroundReadUnixNano.Store(hot) + }() + + db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: true, + rewriteDebtDrain: true, + }) + + if got := recorder.recordedGCObservedSourceCalls(); got != 1 { + t.Fatalf("observed-source gc calls=%d want 1", got) + } + if got := db.vlogGenerationGCRuns.Load(); got != 1 { + t.Fatalf("gc runs=%d want 1", got) + } + if got := db.vlogGenerationObservedGCRetryQueued.Load(); got != 0 { + t.Fatalf("observed-source gc retry queued=%d want 0", got) + } + if pending := len(db.takeVlogGenerationObservedSourceGCList()); pending != 0 { + 
t.Fatalf("observed-source gc pending ids=%d want 0", pending) + } +} + func TestVlogGenerationRewrite_FreshPlanExecIgnoresForegroundCancelUntilBoundedComplete(t *testing.T) { prepareDirectSchedulerTest(t) From 76938986a104ba5b89bb163f7ed25a818130a73a Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 05:19:28 -1000 Subject: [PATCH 27/61] tools: add live vlog maintenance capacity analyzer --- docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md | 21 + scripts/analyze_vlog_maintenance_capacity.py | 513 +++++++++++++++++++ worklog/2026-03-28.md | 33 ++ 3 files changed, 567 insertions(+) create mode 100755 scripts/analyze_vlog_maintenance_capacity.py create mode 100644 worklog/2026-03-28.md diff --git a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md index 052bd806d..2d0a98274 100644 --- a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md +++ b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md @@ -32,6 +32,27 @@ Primary keys: - `treedb.cache.vlog_generation.vacuum.runs` - `treedb.cache.vlog_generation.vacuum.failures` +## Live Run Capacity Report +For `run_celestia`-style runs, analyze the latest diagnostics snapshot with: + +```bash +./scripts/analyze_vlog_maintenance_capacity.py +``` + +Optional explicit input: + +```bash +./scripts/analyze_vlog_maintenance_capacity.py ~/.celestia-app-mainnet-treedb- +./scripts/analyze_vlog_maintenance_capacity.py ~/.celestia-app-mainnet-treedb-/sync/diagnostics/.debug_vars.json +``` + +The report highlights: +- maintenance lane pressure (attempt/acquire/collision + skip mix) +- rewrite plan-to-exec realization +- stale-bytes processed vs immediate reclaim +- observed-source replay drain +- GC eligibility/protection signals + ## Bench Commands ### Churn sanity (TreeDB) ```bash diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py new file mode 100755 index 000000000..07aa4c926 --- /dev/null +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -0,0 +1,513 @@ 
+#!/usr/bin/env python3 +"""Summarize live TreeDB vlog maintenance capacity from run_celestia diagnostics. + +Input can be: +- a run home dir (e.g. ~/.celestia-app-mainnet-treedb-YYYY...) +- a diagnostics dir +- a debug vars JSON file + +By default, the script scans the newest ~/.celestia-app-mainnet-treedb-* home. +""" + +from __future__ import annotations + +import argparse +import glob +import json +import math +import os +import sys +from pathlib import Path +from typing import Any + + +def human_bytes(value: float) -> str: + if value is None or math.isnan(value): + return "n/a" + n = float(value) + if n < 0: + return f"-{human_bytes(-n)}" + units = ["B", "KiB", "MiB", "GiB", "TiB"] + idx = 0 + while n >= 1024.0 and idx < len(units) - 1: + n /= 1024.0 + idx += 1 + if idx == 0: + return f"{int(n)} {units[idx]}" + return f"{n:.2f} {units[idx]}" + + +def pct(num: float, den: float) -> float: + if den <= 0: + return 0.0 + return 100.0 * num / den + + +def safe_int(value: Any, default: int = 0) -> int: + if isinstance(value, bool): + return int(value) + if isinstance(value, int): + return value + if isinstance(value, float): + return int(value) + if isinstance(value, str): + s = value.strip().lower() + if not s: + return default + if s == "true": + return 1 + if s == "false": + return 0 + try: + return int(s) + except ValueError: + try: + return int(float(s)) + except ValueError: + return default + return default + + +def safe_float(value: Any, default: float = 0.0) -> float: + if isinstance(value, bool): + return float(int(value)) + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + s = value.strip().lower() + if not s: + return default + if s == "true": + return 1.0 + if s == "false": + return 0.0 + try: + return float(s) + except ValueError: + return default + return default + + +def pick_latest(paths: list[Path]) -> Path | None: + if not paths: + return None + return max(paths, key=lambda p: p.stat().st_mtime) + + +def 
find_latest_home() -> Path | None: + homes: list[Path] = [] + for raw in glob.glob(os.path.expanduser("~/.celestia-app-mainnet-treedb-*")): + p = Path(raw) + if p.is_dir(): + homes.append(p) + return pick_latest(homes) + + +def find_diagnostics_file(root: Path) -> Path | None: + roots: list[Path] = [] + if (root / "sync" / "diagnostics").is_dir(): + roots.append(root / "sync" / "diagnostics") + if (root / "diagnostics").is_dir(): + roots.append(root / "diagnostics") + if root.is_dir() and root.name == "diagnostics": + roots.append(root) + + patterns = ["*.debug_vars.json", "*.treedb_vars.json", "*.treedb_application_vars.json"] + + # Prefer richer payload shapes in order. Ignore obviously empty snapshots. + for pat in patterns: + candidates: list[Path] = [] + for diag in roots: + candidates.extend(diag.glob(pat)) + # If caller passed a file-like path prefix directory with JSON files only. + if root.is_dir() and not roots: + candidates.extend(root.glob(pat)) + candidates.sort(key=lambda p: p.stat().st_mtime, reverse=True) + for cand in candidates: + # "{}\n" snapshots are not useful for maintenance analysis. + if cand.stat().st_size <= 4: + continue + return cand + + # Fallback: if all snapshots are tiny/empty, still return the newest one. + fallback: list[Path] = [] + for pat in patterns: + for diag in roots: + fallback.extend(diag.glob(pat)) + if root.is_dir() and not roots: + fallback.extend(root.glob(pat)) + return pick_latest(fallback) + + +def find_home_from_path(path: Path) -> str: + for parent in [path] + list(path.parents): + name = parent.name + if name.startswith(".celestia-app-mainnet-"): + return str(parent) + return "" + + +def choose_instance(instances: dict[str, Any], pattern: str) -> tuple[str, dict[str, Any]]: + if not instances: + return "", {} + + if pattern: + matches = [(k, v) for k, v in instances.items() if pattern in k and isinstance(v, dict)] + if matches: + # Prefer the richest stats object among matches. 
+ matches.sort(key=lambda kv: len(kv[1]), reverse=True) + return matches[0][0], matches[0][1] + + scored: list[tuple[int, int, str, dict[str, Any]]] = [] + for k, v in instances.items(): + if not isinstance(v, dict): + continue + vg_count = sum(1 for key in v.keys() if str(key).startswith("treedb.cache.vlog_generation.")) + scored.append((vg_count, len(v), k, v)) + if scored: + scored.sort(reverse=True) + _, _, k, v = scored[0] + return k, v + + first_key = sorted(instances.keys())[0] + val = instances[first_key] + if isinstance(val, dict): + return first_key, val + return first_key, {} + + +def extract_stats(payload: Any, instance_pattern: str) -> tuple[dict[str, Any], str]: + if not isinstance(payload, dict): + return {}, "" + + # Most complete shape from debug vars snapshots: + # { "treedb": { "instances": { "...": { stats... } } } } + treedb = payload.get("treedb") + if isinstance(treedb, dict): + instances = treedb.get("instances") + if isinstance(instances, dict): + instance_name, stats = choose_instance(instances, instance_pattern) + return stats, instance_name + + # Flat stats map shape. + if any(str(k).startswith("treedb.cache.") for k in payload.keys()): + return payload, "" + + # Other possible shape: top-level instances. 
+ instances = payload.get("instances") + if isinstance(instances, dict): + instance_name, stats = choose_instance(instances, instance_pattern) + return stats, instance_name + + return {}, "" + + +def metric_int(stats: dict[str, Any], key: str) -> int: + return safe_int(stats.get(key, 0), 0) + + +def metric_float(stats: dict[str, Any], key: str) -> float: + return safe_float(stats.get(key, 0.0), 0.0) + + +def build_summary(stats: dict[str, Any]) -> dict[str, Any]: + m = { + "maintenance_attempts": metric_int(stats, "treedb.cache.vlog_generation.maintenance.attempts"), + "maintenance_acquired": metric_int(stats, "treedb.cache.vlog_generation.maintenance.acquired"), + "maintenance_collisions": metric_int(stats, "treedb.cache.vlog_generation.maintenance.collisions"), + "maintenance_noop": metric_int(stats, "treedb.cache.vlog_generation.maintenance.passes.noop"), + "maintenance_with_rewrite": metric_int(stats, "treedb.cache.vlog_generation.maintenance.passes.with_rewrite"), + "maintenance_with_gc": metric_int(stats, "treedb.cache.vlog_generation.maintenance.passes.with_gc"), + "rewrite_runs": metric_int(stats, "treedb.cache.vlog_generation.rewrite.runs"), + "rewrite_plan_runs": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_runs"), + "rewrite_plan_selected": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected"), + "rewrite_plan_empty": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_empty"), + "rewrite_plan_selected_segments_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected_segments_total"), + "rewrite_exec_source_segments_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_total"), + "rewrite_plan_selected_bytes_stale": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale"), + "rewrite_processed_stale_bytes": metric_int(stats, "treedb.cache.vlog_generation.rewrite.processed_stale_bytes"), + "rewrite_processed_live_bytes": 
metric_int(stats, "treedb.cache.vlog_generation.rewrite.processed_live_bytes"), + "rewrite_bytes_in": metric_int(stats, "treedb.cache.vlog_generation.rewrite.bytes_in"), + "rewrite_bytes_out": metric_int(stats, "treedb.cache.vlog_generation.rewrite.bytes_out"), + "rewrite_reclaimed_bytes": metric_int(stats, "treedb.cache.vlog_generation.rewrite.reclaimed_bytes"), + "rewrite_no_reclaim_runs": metric_int(stats, "treedb.cache.vlog_generation.rewrite.no_reclaim_runs"), + "rewrite_exec_total_ms": metric_float(stats, "treedb.cache.vlog_generation.rewrite.exec.total_ms"), + "rewrite_exec_avg_ms": metric_float(stats, "treedb.cache.vlog_generation.rewrite.exec.avg_ms"), + "rewrite_ledger_bytes_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.ledger_bytes_total"), + "rewrite_ledger_bytes_stale": metric_int(stats, "treedb.cache.vlog_generation.rewrite.ledger_bytes_stale"), + "rewrite_ledger_segments": metric_int(stats, "treedb.cache.vlog_generation.rewrite.ledger_segments"), + "rewrite_age_blocked_remaining_ms": metric_int(stats, "treedb.cache.vlog_generation.rewrite.age_blocked_remaining_ms"), + "rewrite_penalties_active": metric_int(stats, "treedb.cache.vlog_generation.rewrite.penalties_active"), + "rewrite_budget_consumed_bytes_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total"), + "rewrite_budget_tokens_utilization_pct": metric_float(stats, "treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct"), + "gc_runs": metric_int(stats, "treedb.cache.vlog_generation.gc.runs"), + "gc_deleted_bytes": metric_int(stats, "treedb.cache.vlog_generation.gc.deleted_bytes"), + "gc_deleted_segments": metric_int(stats, "treedb.cache.vlog_generation.gc.deleted_segments"), + "gc_exec_total_ms": metric_float(stats, "treedb.cache.vlog_generation.gc.exec.total_ms"), + "gc_exec_avg_ms": metric_float(stats, "treedb.cache.vlog_generation.gc.exec.avg_ms"), + "gc_last_eligible_bytes": metric_int(stats, 
"treedb.cache.vlog_generation.gc.last_eligible_bytes"), + "gc_last_pending_bytes": metric_int(stats, "treedb.cache.vlog_generation.gc.last_pending_bytes"), + "gc_last_protected_retained_bytes": metric_int(stats, "treedb.cache.vlog_generation.gc.last_protected_retained_bytes"), + "observed_gc_pending_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.pending_ids"), + "observed_gc_queued_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.queued_ids"), + "observed_gc_taken_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.taken_ids"), + "observed_gc_runs": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.runs"), + "observed_gc_retry_queued": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.retry_queued"), + "checkpoint_kick_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.runs"), + "checkpoint_kick_gc_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.gc_runs"), + "checkpoint_kick_rewrite_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"), + } + + skip_keys = [ + "treedb.cache.vlog_generation.maintenance.skip.wal_on_periodic", + "treedb.cache.vlog_generation.maintenance.skip.maintenance_phase", + "treedb.cache.vlog_generation.maintenance.skip.stage_gate", + "treedb.cache.vlog_generation.maintenance.skip.stage_gate_not_due", + "treedb.cache.vlog_generation.maintenance.skip.stage_gate_due_reserved", + "treedb.cache.vlog_generation.maintenance.skip.age_blocked_gate", + "treedb.cache.vlog_generation.maintenance.skip.priority_pending", + "treedb.cache.vlog_generation.maintenance.skip.quiet_window", + "treedb.cache.vlog_generation.maintenance.skip.before_first_checkpoint", + "treedb.cache.vlog_generation.maintenance.skip.checkpoint_inflight", + ] + skip_map = {k.split(".")[-1]: metric_int(stats, k) for k in skip_keys} + m["maintenance_skip"] = skip_map + m["maintenance_skip_total"] = sum(skip_map.values()) + + passes_total = 
m["maintenance_noop"] + m["maintenance_with_rewrite"] + m["maintenance_with_gc"] + m["maintenance_passes_total"] = passes_total + m["maintenance_acquire_rate_pct"] = pct(m["maintenance_acquired"], m["maintenance_attempts"]) + m["maintenance_collision_rate_pct"] = pct(m["maintenance_collisions"], m["maintenance_attempts"]) + m["maintenance_rewrite_pass_share_pct"] = pct(m["maintenance_with_rewrite"], passes_total) + m["maintenance_gc_pass_share_pct"] = pct(m["maintenance_with_gc"], passes_total) + + m["rewrite_plan_select_rate_pct"] = pct(m["rewrite_plan_selected"], m["rewrite_plan_runs"]) + m["rewrite_segment_realization_pct"] = pct( + m["rewrite_exec_source_segments_total"], + m["rewrite_plan_selected_segments_total"], + ) + m["rewrite_stale_selection_coverage_pct"] = pct( + m["rewrite_processed_stale_bytes"], + m["rewrite_plan_selected_bytes_stale"], + ) + m["rewrite_immediate_reclaim_pct"] = pct( + m["rewrite_reclaimed_bytes"], + m["rewrite_processed_stale_bytes"], + ) + m["rewrite_stale_not_reclaimed_bytes"] = max( + 0, + m["rewrite_processed_stale_bytes"] - m["rewrite_reclaimed_bytes"], + ) + rewrite_secs = m["rewrite_exec_total_ms"] / 1000.0 + m["rewrite_exec_throughput_bytes_per_sec"] = ( + (m["rewrite_bytes_in"] / rewrite_secs) if rewrite_secs > 0 else 0.0 + ) + + gc_secs = m["gc_exec_total_ms"] / 1000.0 + m["gc_delete_throughput_bytes_per_sec"] = ( + (m["gc_deleted_bytes"] / gc_secs) if gc_secs > 0 else 0.0 + ) + + m["observed_gc_drain_pct"] = pct(m["observed_gc_taken_ids"], m["observed_gc_queued_ids"]) + + return m + + +def print_report(summary: dict[str, Any], source_file: Path, run_home: str, instance_name: str) -> None: + print(f"Source file: {source_file}") + if run_home: + print(f"Run home: {run_home}") + if instance_name: + print(f"Instance: {instance_name}") + print("") + + print("Maintenance lane") + print( + " attempts/acquired/collisions: " + f"{summary['maintenance_attempts']} / {summary['maintenance_acquired']} / 
{summary['maintenance_collisions']} " + f"(acquire={summary['maintenance_acquire_rate_pct']:.1f}%, collision={summary['maintenance_collision_rate_pct']:.1f}%)" + ) + print( + " passes: " + f"total={summary['maintenance_passes_total']} " + f"noop={summary['maintenance_noop']} " + f"rewrite={summary['maintenance_with_rewrite']} " + f"gc={summary['maintenance_with_gc']} " + f"(rewrite_share={summary['maintenance_rewrite_pass_share_pct']:.1f}%, gc_share={summary['maintenance_gc_pass_share_pct']:.1f}%)" + ) + skips = summary["maintenance_skip"] + print( + " skip pressure: " + f"total={summary['maintenance_skip_total']} " + f"stage_gate={skips['stage_gate']} " + f"stage_not_due={skips['stage_gate_not_due']} " + f"age_blocked={skips['age_blocked_gate']} " + f"quiet={skips['quiet_window']} " + f"checkpoint={skips['checkpoint_inflight']}" + ) + print("") + + print("Rewrite economics") + print( + " plan runs/selected/empty: " + f"{summary['rewrite_plan_runs']} / {summary['rewrite_plan_selected']} / {summary['rewrite_plan_empty']} " + f"(select_rate={summary['rewrite_plan_select_rate_pct']:.1f}%)" + ) + print( + " selected->executed segments: " + f"{summary['rewrite_plan_selected_segments_total']} -> {summary['rewrite_exec_source_segments_total']} " + f"(realization={summary['rewrite_segment_realization_pct']:.1f}%)" + ) + print( + " selected stale vs processed stale: " + f"{human_bytes(summary['rewrite_plan_selected_bytes_stale'])} -> {human_bytes(summary['rewrite_processed_stale_bytes'])} " + f"(coverage={summary['rewrite_stale_selection_coverage_pct']:.1f}%)" + ) + print( + " bytes in/out/reclaimed: " + f"{human_bytes(summary['rewrite_bytes_in'])} / {human_bytes(summary['rewrite_bytes_out'])} / {human_bytes(summary['rewrite_reclaimed_bytes'])}" + ) + print( + " stale processed w/o immediate reclaim: " + f"{human_bytes(summary['rewrite_stale_not_reclaimed_bytes'])} " + f"(immediate_reclaim={summary['rewrite_immediate_reclaim_pct']:.2f}%, 
no_reclaim_runs={summary['rewrite_no_reclaim_runs']})" + ) + print( + " exec: " + f"runs={summary['rewrite_runs']} total_ms={summary['rewrite_exec_total_ms']:.3f} avg_ms={summary['rewrite_exec_avg_ms']:.3f} " + f"throughput={human_bytes(summary['rewrite_exec_throughput_bytes_per_sec'])}/s" + ) + print( + " debt/budget: " + f"ledger={human_bytes(summary['rewrite_ledger_bytes_total'])} (stale={human_bytes(summary['rewrite_ledger_bytes_stale'])}, segs={summary['rewrite_ledger_segments']}) " + f"age_blocked_ms={summary['rewrite_age_blocked_remaining_ms']} penalties={summary['rewrite_penalties_active']} " + f"budget_consumed={human_bytes(summary['rewrite_budget_consumed_bytes_total'])} " + f"budget_util={summary['rewrite_budget_tokens_utilization_pct']:.1f}%" + ) + print("") + + print("GC economics") + print( + " runs/deleted: " + f"{summary['gc_runs']} / {summary['gc_deleted_segments']} segments, {human_bytes(summary['gc_deleted_bytes'])}" + ) + print( + " exec: " + f"total_ms={summary['gc_exec_total_ms']:.3f} avg_ms={summary['gc_exec_avg_ms']:.3f} " + f"delete_throughput={human_bytes(summary['gc_delete_throughput_bytes_per_sec'])}/s" + ) + print( + " last eligibility/protection: " + f"eligible={human_bytes(summary['gc_last_eligible_bytes'])} " + f"pending={human_bytes(summary['gc_last_pending_bytes'])} " + f"protected_retained={human_bytes(summary['gc_last_protected_retained_bytes'])}" + ) + print( + " checkpoint-kick: " + f"runs={summary['checkpoint_kick_runs']} rewrite_runs={summary['checkpoint_kick_rewrite_runs']} gc_runs={summary['checkpoint_kick_gc_runs']}" + ) + print("") + + print("Observed-source replay") + print( + " queued/taken/pending ids: " + f"{summary['observed_gc_queued_ids']} / {summary['observed_gc_taken_ids']} / {summary['observed_gc_pending_ids']} " + f"(drain={summary['observed_gc_drain_pct']:.1f}%, retries={summary['observed_gc_retry_queued']}, runs={summary['observed_gc_runs']})" + ) + + print("") + notes: list[str] = [] + if 
summary["rewrite_processed_stale_bytes"] > 0 and summary["rewrite_reclaimed_bytes"] == 0: + notes.append("rewrite copied stale bytes but immediate reclaim is zero; inspect GC eligibility/protection and post-run rewrite window") + if summary["observed_gc_pending_ids"] > 0: + notes.append("observed-source GC backlog still pending; may need longer run window or higher checkpoint-kick pressure") + if summary["maintenance_collision_rate_pct"] > 20.0: + notes.append("maintenance collision rate is high; lane contention may be throttling rewrite/GC progress") + if summary["rewrite_segment_realization_pct"] < 60.0 and summary["rewrite_plan_selected_segments_total"] > 0: + notes.append("rewrite segment realization is low; staged debt is being selected faster than executed") + if not notes: + notes.append("no obvious maintenance-lane bottleneck signature in this snapshot") + + print("Signals") + for note in notes: + print(f" - {note}") + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description="Analyze TreeDB live vlog maintenance capacity from run_celestia diagnostics") + p.add_argument( + "input", + nargs="?", + help="run home dir, diagnostics dir, or debug vars JSON file (default: latest ~/.celestia-app-mainnet-treedb-*)", + ) + p.add_argument( + "--instance-pattern", + default="application.db", + help="prefer instance names containing this substring when debug_vars has multiple DB instances", + ) + p.add_argument("--json", action="store_true", help="emit JSON summary instead of text report") + return p.parse_args() + + +def resolve_source(input_arg: str | None) -> Path: + if input_arg: + p = Path(os.path.expanduser(input_arg)).resolve() + if not p.exists(): + raise FileNotFoundError(f"input does not exist: {p}") + if p.is_file(): + return p + src = find_diagnostics_file(p) + if src is None: + raise FileNotFoundError(f"no diagnostics JSON found under: {p}") + return src + + home = find_latest_home() + if home is None: + raise 
FileNotFoundError("no ~/.celestia-app-mainnet-treedb-* directories found") + src = find_diagnostics_file(home) + if src is None: + raise FileNotFoundError(f"no diagnostics JSON found under: {home}") + return src + + +def main() -> int: + args = parse_args() + try: + source = resolve_source(args.input) + except FileNotFoundError as exc: + print(f"error: {exc}", file=sys.stderr) + return 2 + + try: + payload = json.loads(source.read_text(encoding="utf-8")) + except Exception as exc: + print(f"error: failed to parse JSON from {source}: {exc}", file=sys.stderr) + return 2 + + stats, instance_name = extract_stats(payload, args.instance_pattern) + if not stats: + print( + "error: could not extract treedb stats map from JSON (expected debug_vars shape or flat stats map)", + file=sys.stderr, + ) + return 2 + + summary = build_summary(stats) + run_home = find_home_from_path(source) + + if args.json: + out = { + "source_file": str(source), + "run_home": run_home, + "instance": instance_name, + "summary": summary, + } + print(json.dumps(out, indent=2, sort_keys=True)) + else: + print_report(summary, source, run_home, instance_name) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md new file mode 100644 index 000000000..ecef3eff7 --- /dev/null +++ b/worklog/2026-03-28.md @@ -0,0 +1,33 @@ +# 2026-03-28 + +- Added a repeatable live-maintenance capacity analyzer: + - `scripts/analyze_vlog_maintenance_capacity.py` + - Input modes: + - latest run home auto-discovery (default) + - explicit run home dir + - explicit diagnostics JSON snapshot + - Prefers `*.debug_vars.json` snapshots and the `application.db` instance in multi-instance payloads. 
+ - Emits derived signals for: + - maintenance lane pressure (attempt/acquire/collision + skip mix) + - rewrite plan select rate and selected->executed realization + - selected stale bytes vs processed stale bytes + - immediate reclaim ratio + - observed-source replay queue drain + - GC eligibility/protection summary + +- Updated runbook docs with command + usage: + - `docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md` + - Added `Live Run Capacity Report` section. + +- Validation run (existing Celestia home): + - command: + - `./scripts/analyze_vlog_maintenance_capacity.py /home/mikers/.celestia-app-mainnet-treedb-20260328050437` + - key outputs: + - `maintenance attempts/acquired/collisions = 74 / 74 / 0` + - `rewrite plan runs/selected/empty = 9 / 4 / 5` + - `selected->executed segments = 14 -> 7 (50.0%)` + - `selected stale -> processed stale = 2.91 GiB -> 1.46 GiB (50.0%)` + - `rewrite reclaimed bytes = 0 B` with `processed stale = 1.46 GiB` + - `observed-source queued/taken/pending ids = 29 / 29 / 0` + - interpretation: + - forced observed-source replay now drains cleanly, but the dominant remaining bottleneck is still zero immediate reclaim despite substantial stale rewrite processing. From 431d323f84a87851611739b70671164b17513352 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 05:30:48 -1000 Subject: [PATCH 28/61] worklog: record high-budget live rewrite run --- worklog/2026-03-28.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md index ecef3eff7..86925f34a 100644 --- a/worklog/2026-03-28.md +++ b/worklog/2026-03-28.md @@ -31,3 +31,29 @@ - `observed-source queued/taken/pending ids = 29 / 29 / 0` - interpretation: - forced observed-source replay now drains cleanly, but the dominant remaining bottleneck is still zero immediate reclaim despite substantial stale rewrite processing. 
+ +- Live run: higher rewrite budget pass (same fast profile) + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=536870912 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328052003` + - analyzer readout: + - `./scripts/analyze_vlog_maintenance_capacity.py /home/mikers/.celestia-app-mainnet-treedb-20260328052003` + - delta vs prior run (`/home/mikers/.celestia-app-mainnet-treedb-20260328050437`): + - `rewrite.exec.source_segments_total`: `7 -> 8` + - `rewrite.segment_realization_pct`: `50.0% -> 61.5%` + - `rewrite.processed_stale_bytes`: `1.46 GiB -> 1.60 GiB` + - `rewrite.bytes_in`: `300.41 MiB -> 408.30 MiB` + - `rewrite exec throughput`: `9.72 MiB/s -> 13.22 MiB/s` + - `rewrite.reclaimed_bytes`: stayed `0 B` + - interpretation: + - Higher budget improves rewrite execution throughput and plan-to-exec realization, but does not solve the core in-run reclaim issue (GC eligibility still zero at final snapshot). 
+ +- Post-run offline rewrite on the higher-budget home: + - command: + - `/home/mikers/dev/snissn/celestia-app-p4/build/treemap-local vlog-rewrite /home/mikers/.celestia-app-mainnet-treedb-20260328052003/data/application.db -rw` + - output: + - `vlog-rewrite: segments_before=16 segments_after=17 bytes_before=3496705485 bytes_after=2168049697 records=1011649` + - post-rewrite size/gzip: + - `du -sb`: `2208117397` + - `tar|gzip|wc -c`: `1781585169` From f587805ed0a24728b4049f740200d809ce22e5e5 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 05:55:10 -1000 Subject: [PATCH 29/61] caching: expose cumulative observed-source gc totals --- TreeDB/caching/db.go | 21 +++++++++ .../caching/vlog_generation_scheduler_test.go | 28 ++++++++++++ scripts/analyze_vlog_maintenance_capacity.py | 44 +++++++++++++++++++ worklog/2026-03-28.md | 35 +++++++++++++++ 4 files changed, 128 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 287f08e17..b654355df 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5643,6 +5643,13 @@ type DB struct { vlogGenerationObservedGCTakenIDs atomic.Uint64 vlogGenerationObservedGCRuns atomic.Uint64 vlogGenerationObservedGCRetryQueued atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsEligibleTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsDeletedTotal atomic.Uint64 + vlogGenerationObservedGCSourceBytesTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesEligibleTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesDeletedTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesProtectedRetainedTotal atomic.Int64 retainedPruneMu sync.Mutex retainedPruneDone chan struct{} vlogGenerationRemapSuccesses atomic.Uint64 @@ -13080,6 +13087,13 @@ func (db *DB) observeVlogGenerationGCStats(stats backenddb.ValueLogGCStats) { db.vlogGenerationLastGCObservedSourceBytesEligible.Store(stats.ObservedSourceBytesEligible) 
db.vlogGenerationLastGCObservedSourceBytesDeleted.Store(stats.ObservedSourceBytesDeleted) db.vlogGenerationLastGCObservedSourceBytesPending.Store(stats.ObservedSourceBytesPending) + db.vlogGenerationObservedGCSourceSegmentsTotal.Add(uint64(stats.ObservedSourceSegments)) + db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Add(uint64(stats.ObservedSourceSegmentsEligible)) + db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Add(uint64(stats.ObservedSourceSegmentsDeleted)) + db.vlogGenerationObservedGCSourceBytesTotal.Add(stats.ObservedSourceBytes) + db.vlogGenerationObservedGCSourceBytesEligibleTotal.Add(stats.ObservedSourceBytesEligible) + db.vlogGenerationObservedGCSourceBytesDeletedTotal.Add(stats.ObservedSourceBytesDeleted) + db.vlogGenerationObservedGCSourceBytesProtectedRetainedTotal.Add(stats.ObservedSourceBytesProtectedRetained) } func (db *DB) observeVlogGenerationVacuumExecDuration(d time.Duration) { @@ -20766,6 +20780,13 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.observed_gc.taken_ids"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCTakenIDs.Load()) stats["treedb.cache.vlog_generation.observed_gc.runs"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRuns.Load()) stats["treedb.cache.vlog_generation.observed_gc.retry_queued"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRetryQueued.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesTotal.Load()) + 
stats["treedb.cache.vlog_generation.observed_gc.source_bytes_eligible_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesEligibleTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesDeletedTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesProtectedRetainedTotal.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteExecSourceSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsRequestedTotal.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsStillReferencedTotal.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 9c5cd67ff..b2305abc4 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -6089,6 +6089,13 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationObservedGCTakenIDs.Store(9) db.vlogGenerationObservedGCRuns.Store(3) db.vlogGenerationObservedGCRetryQueued.Store(2) + db.vlogGenerationObservedGCSourceSegmentsTotal.Store(11) + db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Store(5) + db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Store(3) + db.vlogGenerationObservedGCSourceBytesTotal.Store(1100) + db.vlogGenerationObservedGCSourceBytesEligibleTotal.Store(500) + db.vlogGenerationObservedGCSourceBytesDeletedTotal.Store(300) + db.vlogGenerationObservedGCSourceBytesProtectedRetainedTotal.Store(250) db.vlogGenerationRewriteQueueMu.Lock() db.vlogGenerationRewriteQueueLoaded = 
true @@ -6372,4 +6379,25 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.observed_gc.retry_queued"]; got != "2" { t.Fatalf("observed gc retry queued=%q want 2", got) } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_total"]; got != "11" { + t.Fatalf("observed gc source segments total=%q want 11", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"]; got != "5" { + t.Fatalf("observed gc source segments eligible total=%q want 5", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"]; got != "3" { + t.Fatalf("observed gc source segments deleted total=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_total"]; got != "1100" { + t.Fatalf("observed gc source bytes total=%q want 1100", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_eligible_total"]; got != "500" { + t.Fatalf("observed gc source bytes eligible total=%q want 500", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total"]; got != "300" { + t.Fatalf("observed gc source bytes deleted total=%q want 300", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total"]; got != "250" { + t.Fatalf("observed gc source bytes protected retained total=%q want 250", got) + } } diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py index 07aa4c926..509e2f508 100755 --- a/scripts/analyze_vlog_maintenance_capacity.py +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -253,6 +253,13 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: "observed_gc_taken_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.taken_ids"), "observed_gc_runs": metric_int(stats, 
"treedb.cache.vlog_generation.observed_gc.runs"), "observed_gc_retry_queued": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.retry_queued"), + "observed_gc_source_segments_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_total"), + "observed_gc_source_segments_eligible_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"), + "observed_gc_source_segments_deleted_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"), + "observed_gc_source_bytes_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_total"), + "observed_gc_source_bytes_eligible_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_eligible_total"), + "observed_gc_source_bytes_deleted_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total"), + "observed_gc_source_bytes_protected_retained_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total"), "checkpoint_kick_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.runs"), "checkpoint_kick_gc_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.gc_runs"), "checkpoint_kick_rewrite_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"), @@ -309,6 +316,26 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: ) m["observed_gc_drain_pct"] = pct(m["observed_gc_taken_ids"], m["observed_gc_queued_ids"]) + m["observed_gc_source_segments_eligible_pct"] = pct( + m["observed_gc_source_segments_eligible_total"], + m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_segments_deleted_pct"] = pct( + m["observed_gc_source_segments_deleted_total"], + m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_bytes_eligible_pct"] = pct( + m["observed_gc_source_bytes_eligible_total"], + 
m["observed_gc_source_bytes_total"], + ) + m["observed_gc_source_bytes_deleted_pct"] = pct( + m["observed_gc_source_bytes_deleted_total"], + m["observed_gc_source_bytes_total"], + ) + m["observed_gc_source_bytes_deleted_of_eligible_pct"] = pct( + m["observed_gc_source_bytes_deleted_total"], + m["observed_gc_source_bytes_eligible_total"], + ) return m @@ -414,6 +441,23 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst f"{summary['observed_gc_queued_ids']} / {summary['observed_gc_taken_ids']} / {summary['observed_gc_pending_ids']} " f"(drain={summary['observed_gc_drain_pct']:.1f}%, retries={summary['observed_gc_retry_queued']}, runs={summary['observed_gc_runs']})" ) + print( + " observed-source totals: " + f"segments total={summary['observed_gc_source_segments_total']} " + f"eligible={summary['observed_gc_source_segments_eligible_total']} " + f"deleted={summary['observed_gc_source_segments_deleted_total']} " + f"(eligible_pct={summary['observed_gc_source_segments_eligible_pct']:.1f}%, deleted_pct={summary['observed_gc_source_segments_deleted_pct']:.1f}%)" + ) + print( + " observed-source bytes: " + f"total={human_bytes(summary['observed_gc_source_bytes_total'])} " + f"eligible={human_bytes(summary['observed_gc_source_bytes_eligible_total'])} " + f"deleted={human_bytes(summary['observed_gc_source_bytes_deleted_total'])} " + f"protected_retained={human_bytes(summary['observed_gc_source_bytes_protected_retained_total'])} " + f"(eligible_pct={summary['observed_gc_source_bytes_eligible_pct']:.1f}%, " + f"deleted_pct={summary['observed_gc_source_bytes_deleted_pct']:.1f}%, " + f"deleted_of_eligible={summary['observed_gc_source_bytes_deleted_of_eligible_pct']:.1f}%)" + ) print("") notes: list[str] = [] diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md index 86925f34a..4ae00e805 100644 --- a/worklog/2026-03-28.md +++ b/worklog/2026-03-28.md @@ -57,3 +57,38 @@ - post-rewrite size/gzip: - `du -sb`: `2208117397` - `tar|gzip|wc -c`: 
`1781585169` + +- Added observed-source GC cumulative totals to TreeDB stats + analyzer: + - new stats keys: + - `treedb.cache.vlog_generation.observed_gc.source_segments_total` + - `treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total` + - `treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_eligible_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total` + - analyzer now reports observed-source cumulative eligible/deleted percentages. + +- Validation run with forced rewrite trigger to exercise observed-source path: + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=536870912 TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=1073741824 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328054206` + - analyzer highlights: + - rewrite: `plan_selected_segments_total=2`, `exec.source_segments_total=2`, `processed_stale_bytes=475.06 MiB` + - `rewrite.reclaimed_bytes=0` + - observed replay queue: `queued/taken/pending ids = 14 / 14 / 0` + - observed-source totals: + - segments: `total=12 eligible=0 deleted=0` + - bytes: `total=3.00 GiB eligible=0 B deleted=0 B protected_retained=3.00 GiB` + - interpretation: + - This confirms the bottleneck signature in-run is observed-source bytes remaining retained-protected (never becoming GC-eligible in the measured window), not queue drain failure. 
+ +- Post-run offline checks on same home: + - `vlog-gc -rw`: + - `segments total=22 referenced=22 eligible=0 deleted=0 bytes_total=4737495161 bytes_eligible=0 bytes_deleted=0` + - `vlog-rewrite -rw`: + - `segments_before=22 segments_after=17 bytes_before=4737495161 bytes_after=2199392731 records=1021293` + - post-rewrite size/gzip: + - `du -sb`: `2239722809` + - `tar|gzip|wc -c`: `1805021465` From 4a959bb4adbff225018fb4380bb0fa69eacc2176 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 05:56:51 -1000 Subject: [PATCH 30/61] tools: include retained-prune outcomes in capacity report --- scripts/analyze_vlog_maintenance_capacity.py | 34 ++++++++++++++++++++ worklog/2026-03-28.md | 4 +++ 2 files changed, 38 insertions(+) diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py index 509e2f508..bc2df3e06 100755 --- a/scripts/analyze_vlog_maintenance_capacity.py +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -248,6 +248,19 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: "gc_last_eligible_bytes": metric_int(stats, "treedb.cache.vlog_generation.gc.last_eligible_bytes"), "gc_last_pending_bytes": metric_int(stats, "treedb.cache.vlog_generation.gc.last_pending_bytes"), "gc_last_protected_retained_bytes": metric_int(stats, "treedb.cache.vlog_generation.gc.last_protected_retained_bytes"), + "retained_prune_closed_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.closed_bytes"), + "retained_prune_runs": metric_int(stats, "treedb.cache.vlog_retained_prune.runs"), + "retained_prune_forced_runs": metric_int(stats, "treedb.cache.vlog_retained_prune.forced_runs"), + "retained_prune_candidate_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.candidate_segments"), + "retained_prune_candidate_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.candidate_bytes"), + "retained_prune_removed_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.removed_segments"), + 
"retained_prune_removed_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.removed_bytes"), + "retained_prune_in_use_skipped_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.in_use_skipped_segments"), + "retained_prune_in_use_skipped_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.in_use_skipped_bytes"), + "retained_prune_live_skipped_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.live_skipped_segments"), + "retained_prune_live_skipped_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.live_skipped_bytes"), + "retained_prune_zombie_marked_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.zombie_marked_segments"), + "retained_prune_zombie_marked_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.zombie_marked_bytes"), "observed_gc_pending_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.pending_ids"), "observed_gc_queued_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.queued_ids"), "observed_gc_taken_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.taken_ids"), @@ -336,6 +349,14 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: m["observed_gc_source_bytes_deleted_total"], m["observed_gc_source_bytes_eligible_total"], ) + m["retained_prune_removed_candidate_segments_pct"] = pct( + m["retained_prune_removed_segments"], + m["retained_prune_candidate_segments"], + ) + m["retained_prune_removed_candidate_bytes_pct"] = pct( + m["retained_prune_removed_bytes"], + m["retained_prune_candidate_bytes"], + ) return m @@ -433,6 +454,19 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst " checkpoint-kick: " f"runs={summary['checkpoint_kick_runs']} rewrite_runs={summary['checkpoint_kick_rewrite_runs']} gc_runs={summary['checkpoint_kick_gc_runs']}" ) + print( + " retained-prune: " + f"runs={summary['retained_prune_runs']} forced={summary['retained_prune_forced_runs']} 
closed={human_bytes(summary['retained_prune_closed_bytes'])} " + f"candidates={summary['retained_prune_candidate_segments']} ({human_bytes(summary['retained_prune_candidate_bytes'])}) " + f"removed={summary['retained_prune_removed_segments']} ({human_bytes(summary['retained_prune_removed_bytes'])}) " + f"(seg_removed_pct={summary['retained_prune_removed_candidate_segments_pct']:.1f}%, bytes_removed_pct={summary['retained_prune_removed_candidate_bytes_pct']:.1f}%)" + ) + print( + " retained-prune skips: " + f"in_use={summary['retained_prune_in_use_skipped_segments']} ({human_bytes(summary['retained_prune_in_use_skipped_bytes'])}) " + f"live={summary['retained_prune_live_skipped_segments']} ({human_bytes(summary['retained_prune_live_skipped_bytes'])}) " + f"zombie_marked={summary['retained_prune_zombie_marked_segments']} ({human_bytes(summary['retained_prune_zombie_marked_bytes'])})" + ) print("") print("Observed-source replay") diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md index 4ae00e805..5713c7665 100644 --- a/worklog/2026-03-28.md +++ b/worklog/2026-03-28.md @@ -81,6 +81,10 @@ - observed-source totals: - segments: `total=12 eligible=0 deleted=0` - bytes: `total=3.00 GiB eligible=0 B deleted=0 B protected_retained=3.00 GiB` + - retained-prune summary: + - `runs=2 forced=2 closed=4.75 GiB` + - `candidates=7 (1.75 GiB) removed=0` + - skips: `in_use=6`, `live=5 (1.25 GiB)`, `zombie_marked=2 (512 MiB)` - interpretation: - This confirms the bottleneck signature in-run is observed-source bytes remaining retained-protected (never becoming GC-eligible in the measured window), not queue drain failure. 
From e7dd2a3448482969ec233d0d967ed198b57ab5ff Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 06:19:05 -1000 Subject: [PATCH 31/61] caching: accelerate observed-source retained prune pacing --- TreeDB/caching/db.go | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index b654355df..7302e0b3e 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4639,6 +4639,16 @@ func (db *DB) takeRetainedPruneObservedSourceIDs() map[uint32]struct{} { return out } +func (db *DB) retainedPruneObservedSourcePending() bool { + if db == nil { + return false + } + db.retainedPruneObservedMu.Lock() + pending := len(db.retainedPruneObservedSourceIDs) > 0 + db.retainedPruneObservedMu.Unlock() + return pending +} + func (db *DB) queueVlogGenerationObservedSourceGCList(ids []uint32) { if db == nil || len(ids) == 0 { return @@ -4782,8 +4792,12 @@ func (db *DB) scheduleRetainedValueLogPruneWithForce(force bool) { } db.checkpointMu.Unlock() now := time.Now() + minInterval := retainedPruneMinInterval + if effectiveForce && db.retainedPruneObservedSourcePending() { + minInterval = retainedPruneObservedMinInterval + } last := db.retainedPruneLastStartUnixNano.Load() - if last > 0 && now.Sub(time.Unix(0, last)) < retainedPruneMinInterval { + if last > 0 && now.Sub(time.Unix(0, last)) < minInterval { db.retainedValueLogPruneScheduleSkipMinInterval.Add(1) return } @@ -5934,6 +5948,10 @@ const ( // Retained-path prune is opportunistic reclaim. Do not restart a full live-ID // scan on every periodic checkpoint during a hot workload. retainedPruneMinInterval = 30 * time.Second + // Rewrite-observed source IDs can quickly re-trigger forced retained-prune + // requests while replay GC is trying to converge. Allow a faster cadence for + // that targeted path without dropping the generic min-interval guard. 
+ retainedPruneObservedMinInterval = 3 * time.Second // Coordinate index vacuum with major rewrite windows; do not run on every GC. vlogGenerationVacuumTriggerRewriteBytes = int64(64 << 20) vlogGenerationVacuumMinInterval = 5 * time.Minute From e7ef33865a97d02e957d1148f5033fd67a2a9a3c Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 07:04:27 -1000 Subject: [PATCH 32/61] caching: add observed-prune and zombie lifecycle diagnostics --- TreeDB/caching/db.go | 80 +++++++++++++++++ TreeDB/internal/valuelog/manager.go | 44 ++++++++++ TreeDB/internal/valuelog/manager_test.go | 32 +++++++ docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md | 3 + scripts/analyze_vlog_maintenance_capacity.py | 91 ++++++++++++++++++++ worklog/2026-03-28.md | 66 ++++++++++++++ 6 files changed, 316 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 7302e0b3e..14241a7c0 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4243,6 +4243,48 @@ func (db *DB) observeRetainedValueLogPruneStats(pruneStats retainedValueLogPrune db.retainedValueLogPruneLastObservedSourceParseSkippedBytes.Store(pruneStats.ObservedSourceParseSkippedBytes) db.retainedValueLogPruneLastObservedSourceZombieMarkedSegments.Store(int64(pruneStats.ObservedSourceZombieMarkedSegments)) db.retainedValueLogPruneLastObservedSourceZombieMarkedBytes.Store(pruneStats.ObservedSourceZombieMarkedBytes) + if pruneStats.ObservedSourceSegments > 0 { + db.retainedValueLogPruneObservedSourceSegmentsTotal.Add(uint64(pruneStats.ObservedSourceSegments)) + } + if pruneStats.ObservedSourceBytes > 0 { + db.retainedValueLogPruneObservedSourceBytesTotal.Add(pruneStats.ObservedSourceBytes) + } + if pruneStats.ObservedSourceCandidateSegments > 0 { + db.retainedValueLogPruneObservedSourceCandidateSegmentsTotal.Add(uint64(pruneStats.ObservedSourceCandidateSegments)) + } + if pruneStats.ObservedSourceCandidateBytes > 0 { + 
db.retainedValueLogPruneObservedSourceCandidateBytesTotal.Add(pruneStats.ObservedSourceCandidateBytes) + } + if pruneStats.ObservedSourceRemovedSegments > 0 { + db.retainedValueLogPruneObservedSourceRemovedSegmentsTotal.Add(uint64(pruneStats.ObservedSourceRemovedSegments)) + } + if pruneStats.ObservedSourceRemovedBytes > 0 { + db.retainedValueLogPruneObservedSourceRemovedBytesTotal.Add(pruneStats.ObservedSourceRemovedBytes) + } + if pruneStats.ObservedSourceInUseSkippedSegments > 0 { + db.retainedValueLogPruneObservedSourceInUseSkippedSegmentsTotal.Add(uint64(pruneStats.ObservedSourceInUseSkippedSegments)) + } + if pruneStats.ObservedSourceInUseSkippedBytes > 0 { + db.retainedValueLogPruneObservedSourceInUseSkippedBytesTotal.Add(pruneStats.ObservedSourceInUseSkippedBytes) + } + if pruneStats.ObservedSourceLiveSkippedSegments > 0 { + db.retainedValueLogPruneObservedSourceLiveSkippedSegmentsTotal.Add(uint64(pruneStats.ObservedSourceLiveSkippedSegments)) + } + if pruneStats.ObservedSourceLiveSkippedBytes > 0 { + db.retainedValueLogPruneObservedSourceLiveSkippedBytesTotal.Add(pruneStats.ObservedSourceLiveSkippedBytes) + } + if pruneStats.ObservedSourceParseSkippedSegments > 0 { + db.retainedValueLogPruneObservedSourceParseSkippedSegmentsTotal.Add(uint64(pruneStats.ObservedSourceParseSkippedSegments)) + } + if pruneStats.ObservedSourceParseSkippedBytes > 0 { + db.retainedValueLogPruneObservedSourceParseSkippedBytesTotal.Add(pruneStats.ObservedSourceParseSkippedBytes) + } + if pruneStats.ObservedSourceZombieMarkedSegments > 0 { + db.retainedValueLogPruneObservedSourceZombieMarkedSegmentsTotal.Add(uint64(pruneStats.ObservedSourceZombieMarkedSegments)) + } + if pruneStats.ObservedSourceZombieMarkedBytes > 0 { + db.retainedValueLogPruneObservedSourceZombieMarkedBytesTotal.Add(pruneStats.ObservedSourceZombieMarkedBytes) + } if pruneStats.RetriedWithoutWriteGate { db.retainedValueLogPruneWriteGateRetries.Add(1) if pruneStats.RetrySucceeded { @@ -5637,6 +5679,20 @@ type DB 
struct { retainedValueLogPruneLastObservedSourceParseSkippedBytes atomic.Int64 retainedValueLogPruneLastObservedSourceZombieMarkedSegments atomic.Int64 retainedValueLogPruneLastObservedSourceZombieMarkedBytes atomic.Int64 + retainedValueLogPruneObservedSourceSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceCandidateSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceCandidateBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceRemovedSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceRemovedBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceInUseSkippedSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceInUseSkippedBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceLiveSkippedSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceLiveSkippedBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceParseSkippedSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceParseSkippedBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceZombieMarkedSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceZombieMarkedBytesTotal atomic.Int64 retainedValueLogPruneScheduleRequests atomic.Uint64 retainedValueLogPruneScheduleForcedRequests atomic.Uint64 retainedValueLogPruneScheduleSkipClosing atomic.Uint64 @@ -20676,6 +20732,20 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_parse_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceParseSkippedBytes.Load()) stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_zombie_marked"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceZombieMarkedSegments.Load()) stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_zombie_marked"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceZombieMarkedBytes.Load()) + 
stats["treedb.cache.vlog_retained_prune.observed_source.segments_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_candidate_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceCandidateSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_candidate_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceCandidateBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_removed_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceRemovedSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_removed_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceRemovedBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_in_use_skipped_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceInUseSkippedSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_in_use_skipped_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceInUseSkippedBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_live_skipped_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceLiveSkippedSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_live_skipped_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceLiveSkippedBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_parse_skipped_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceParseSkippedSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_parse_skipped_total"] = fmt.Sprintf("%d", 
db.retainedValueLogPruneObservedSourceParseSkippedBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_zombie_marked_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceZombieMarkedSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_zombie_marked_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceZombieMarkedBytesTotal.Load()) stats["treedb.cache.vlog_retained_prune.pressure_bytes"] = fmt.Sprintf("%d", db.retainedPrunePressureBytes()) stats["treedb.cache.vlog_retained_prune.schedule_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleRequests.Load()) stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleForcedRequests.Load()) @@ -21060,6 +21130,16 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_mmap.dead_mappings.cap_base"] = fmt.Sprintf("%d", valuelog.MaxDeadMappings) stats["treedb.cache.vlog_mmap.max_mapped_sealed_segments"] = fmt.Sprintf("%d", valuelog.MaxMappedSealedSegments) stats["treedb.cache.vlog_mmap.max_mapped_sealed_bytes"] = fmt.Sprintf("%d", valuelog.MaxMappedSealedBytes) + zombieSegments, zombieBytes, zombiePinnedSegments, zombiePinnedBytes, zombieUnpinnedSegments, zombieUnpinnedBytes := db.valueLogReader.ZombieStats() + stats["treedb.cache.vlog_zombie.segments"] = fmt.Sprintf("%d", zombieSegments) + stats["treedb.cache.vlog_zombie.bytes"] = fmt.Sprintf("%d", zombieBytes) + stats["treedb.cache.vlog_zombie.pinned_segments"] = fmt.Sprintf("%d", zombiePinnedSegments) + stats["treedb.cache.vlog_zombie.pinned_bytes"] = fmt.Sprintf("%d", zombiePinnedBytes) + stats["treedb.cache.vlog_zombie.unpinned_segments"] = fmt.Sprintf("%d", zombieUnpinnedSegments) + stats["treedb.cache.vlog_zombie.unpinned_bytes"] = fmt.Sprintf("%d", zombieUnpinnedBytes) + stats["treedb.process.memory.vlog_zombie_bytes_estimate"] = fmt.Sprintf("%d", zombieBytes) + 
stats["treedb.process.memory.vlog_zombie_pinned_bytes_estimate"] = fmt.Sprintf("%d", zombiePinnedBytes) + stats["treedb.process.memory.vlog_zombie_unpinned_bytes_estimate"] = fmt.Sprintf("%d", zombieUnpinnedBytes) stats["treedb.cache.vlog_mmap.active_segments"] = fmt.Sprintf("%d", cacheVlogMmap.activeSegments) stats["treedb.cache.vlog_mmap.active_bytes"] = fmt.Sprintf("%d", cacheVlogMmap.activeBytes) stats["treedb.cache.vlog_mmap.current_segments"] = fmt.Sprintf("%d", cacheVlogMmap.currentSegments) diff --git a/TreeDB/internal/valuelog/manager.go b/TreeDB/internal/valuelog/manager.go index 21c3d5663..230bcef3a 100644 --- a/TreeDB/internal/valuelog/manager.go +++ b/TreeDB/internal/valuelog/manager.go @@ -1290,6 +1290,50 @@ func (m *Manager) RemapStats() (remaps uint64, deadMappings uint64) { return remaps, deadMappings } +func valueLogFileSizeBestEffort(f *File) uint64 { + if f == nil { + return 0 + } + if known := f.fileSize.Load(); known > 0 { + return uint64(known) + } + if data, _ := f.mmapData.Load().([]byte); len(data) > 0 { + return uint64(len(data)) + } + if f.Path != "" { + if info, err := os.Stat(f.Path); err == nil && info.Size() > 0 { + return uint64(info.Size()) + } + } + return 0 +} + +// ZombieStats reports tracked zombie segments and their approximate byte totals. +// A zombie remains on disk until all snapshots release it (RefCount reaches 0). 
+func (m *Manager) ZombieStats() (segments uint64, bytes uint64, pinnedSegments uint64, pinnedBytes uint64, unpinnedSegments uint64, unpinnedBytes uint64) { + if m == nil { + return 0, 0, 0, 0, 0, 0 + } + m.mu.RLock() + for _, f := range m.files { + if f == nil || !f.IsZombie.Load() { + continue + } + segments++ + size := valueLogFileSizeBestEffort(f) + bytes += size + if f.RefCount.Load() > 0 { + pinnedSegments++ + pinnedBytes += size + continue + } + unpinnedSegments++ + unpinnedBytes += size + } + m.mu.RUnlock() + return segments, bytes, pinnedSegments, pinnedBytes, unpinnedSegments, unpinnedBytes +} + // MmapResidencyStats reports aggregate mmap residency split by segment type: // current writable segments, sealed segments, and dead mappings/bytes. func (m *Manager) MmapResidencyStats() (currentSegments uint64, currentBytes uint64, sealedSegments uint64, sealedBytes uint64, deadMappings uint64, deadBytes uint64) { diff --git a/TreeDB/internal/valuelog/manager_test.go b/TreeDB/internal/valuelog/manager_test.go index d6cd2e780..e2b3fd43c 100644 --- a/TreeDB/internal/valuelog/manager_test.go +++ b/TreeDB/internal/valuelog/manager_test.go @@ -92,6 +92,38 @@ func TestManagerMmapResidencyStatsAggregatesCounters(t *testing.T) { } } +func TestManagerZombieStatsAggregatesPinnedAndUnpinned(t *testing.T) { + mgr := &Manager{ + files: map[uint32]*File{ + 1: {}, + 2: {}, + 3: {}, + }, + } + // Zombie + pinned. + mgr.files[1].IsZombie.Store(true) + mgr.files[1].RefCount.Store(2) + mgr.files[1].fileSize.Store(100) + // Zombie + unpinned. + mgr.files[2].IsZombie.Store(true) + mgr.files[2].RefCount.Store(0) + mgr.files[2].fileSize.Store(200) + // Non-zombie should be ignored. 
+ mgr.files[3].RefCount.Store(9) + mgr.files[3].fileSize.Store(300) + + segments, bytes, pinnedSegments, pinnedBytes, unpinnedSegments, unpinnedBytes := mgr.ZombieStats() + if segments != 2 || bytes != 300 { + t.Fatalf("ZombieStats total mismatch: segments=%d bytes=%d want segments=2 bytes=300", segments, bytes) + } + if pinnedSegments != 1 || pinnedBytes != 100 { + t.Fatalf("ZombieStats pinned mismatch: segments=%d bytes=%d want segments=1 bytes=100", pinnedSegments, pinnedBytes) + } + if unpinnedSegments != 1 || unpinnedBytes != 200 { + t.Fatalf("ZombieStats unpinned mismatch: segments=%d bytes=%d want segments=1 bytes=200", unpinnedSegments, unpinnedBytes) + } +} + func TestManagerPromoteCurrentWritable_SwitchesPriorLaneSegmentToSealed(t *testing.T) { mgr := &Manager{ files: make(map[uint32]*File), diff --git a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md index 2d0a98274..3a783b9ce 100644 --- a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md +++ b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md @@ -49,8 +49,11 @@ Optional explicit input: The report highlights: - maintenance lane pressure (attempt/acquire/collision + skip mix) - rewrite plan-to-exec realization +- rewrite source outcomes (requested vs still-referenced vs unreferenced) - stale-bytes processed vs immediate reclaim - observed-source replay drain +- observed-source retained-prune outcomes (candidate/live-skipped/zombie-marked/removed) +- zombie inventory (pinned vs unpinned bytes) - GC eligibility/protection signals ## Bench Commands diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py index bc2df3e06..78c6731da 100755 --- a/scripts/analyze_vlog_maintenance_capacity.py +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -224,6 +224,12 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: "rewrite_plan_empty": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_empty"), 
"rewrite_plan_selected_segments_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected_segments_total"), "rewrite_exec_source_segments_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_total"), + "rewrite_exec_source_segments_requested_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"), + "rewrite_exec_source_segments_still_referenced_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"), + "rewrite_exec_source_segments_unreferenced_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_total"), + "rewrite_exec_source_segments_requested_last": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_last"), + "rewrite_exec_source_segments_still_referenced_last": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_last"), + "rewrite_exec_source_segments_unreferenced_last": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_last"), "rewrite_plan_selected_bytes_stale": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale"), "rewrite_processed_stale_bytes": metric_int(stats, "treedb.cache.vlog_generation.rewrite.processed_stale_bytes"), "rewrite_processed_live_bytes": metric_int(stats, "treedb.cache.vlog_generation.rewrite.processed_live_bytes"), @@ -261,6 +267,26 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: "retained_prune_live_skipped_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.live_skipped_bytes"), "retained_prune_zombie_marked_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.zombie_marked_segments"), "retained_prune_zombie_marked_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.zombie_marked_bytes"), + "vlog_zombie_segments": metric_int(stats, 
"treedb.cache.vlog_zombie.segments"), + "vlog_zombie_bytes": metric_int(stats, "treedb.cache.vlog_zombie.bytes"), + "vlog_zombie_pinned_segments": metric_int(stats, "treedb.cache.vlog_zombie.pinned_segments"), + "vlog_zombie_pinned_bytes": metric_int(stats, "treedb.cache.vlog_zombie.pinned_bytes"), + "vlog_zombie_unpinned_segments": metric_int(stats, "treedb.cache.vlog_zombie.unpinned_segments"), + "vlog_zombie_unpinned_bytes": metric_int(stats, "treedb.cache.vlog_zombie.unpinned_bytes"), + "retained_prune_observed_source_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_total"), + "retained_prune_observed_source_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_total"), + "retained_prune_observed_source_candidate_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_candidate_total"), + "retained_prune_observed_source_candidate_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_candidate_total"), + "retained_prune_observed_source_removed_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_removed_total"), + "retained_prune_observed_source_removed_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_removed_total"), + "retained_prune_observed_source_in_use_skipped_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_in_use_skipped_total"), + "retained_prune_observed_source_in_use_skipped_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_in_use_skipped_total"), + "retained_prune_observed_source_live_skipped_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_live_skipped_total"), + "retained_prune_observed_source_live_skipped_bytes_total": metric_int(stats, 
"treedb.cache.vlog_retained_prune.observed_source.bytes_live_skipped_total"), + "retained_prune_observed_source_parse_skipped_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_parse_skipped_total"), + "retained_prune_observed_source_parse_skipped_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_parse_skipped_total"), + "retained_prune_observed_source_zombie_marked_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_zombie_marked_total"), + "retained_prune_observed_source_zombie_marked_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_zombie_marked_total"), "observed_gc_pending_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.pending_ids"), "observed_gc_queued_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.queued_ids"), "observed_gc_taken_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.taken_ids"), @@ -306,6 +332,14 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: m["rewrite_exec_source_segments_total"], m["rewrite_plan_selected_segments_total"], ) + m["rewrite_source_unreferenced_pct"] = pct( + m["rewrite_exec_source_segments_unreferenced_total"], + m["rewrite_exec_source_segments_requested_total"], + ) + m["rewrite_source_still_referenced_pct"] = pct( + m["rewrite_exec_source_segments_still_referenced_total"], + m["rewrite_exec_source_segments_requested_total"], + ) m["rewrite_stale_selection_coverage_pct"] = pct( m["rewrite_processed_stale_bytes"], m["rewrite_plan_selected_bytes_stale"], @@ -357,6 +391,26 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: m["retained_prune_removed_bytes"], m["retained_prune_candidate_bytes"], ) + m["retained_prune_observed_removed_candidate_segments_pct"] = pct( + m["retained_prune_observed_source_removed_segments_total"], + m["retained_prune_observed_source_candidate_segments_total"], + ) + 
m["retained_prune_observed_removed_candidate_bytes_pct"] = pct( + m["retained_prune_observed_source_removed_bytes_total"], + m["retained_prune_observed_source_candidate_bytes_total"], + ) + m["retained_prune_observed_live_skipped_candidate_segments_pct"] = pct( + m["retained_prune_observed_source_live_skipped_segments_total"], + m["retained_prune_observed_source_candidate_segments_total"], + ) + m["retained_prune_observed_live_skipped_candidate_bytes_pct"] = pct( + m["retained_prune_observed_source_live_skipped_bytes_total"], + m["retained_prune_observed_source_candidate_bytes_total"], + ) + m["vlog_zombie_pinned_bytes_pct"] = pct( + m["vlog_zombie_pinned_bytes"], + m["vlog_zombie_bytes"], + ) return m @@ -406,6 +460,16 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst f"{summary['rewrite_plan_selected_segments_total']} -> {summary['rewrite_exec_source_segments_total']} " f"(realization={summary['rewrite_segment_realization_pct']:.1f}%)" ) + print( + " source outcomes (exec): " + f"requested_total={summary['rewrite_exec_source_segments_requested_total']} " + f"unreferenced_total={summary['rewrite_exec_source_segments_unreferenced_total']} " + f"still_referenced_total={summary['rewrite_exec_source_segments_still_referenced_total']} " + f"(unref_pct={summary['rewrite_source_unreferenced_pct']:.1f}%, still_ref_pct={summary['rewrite_source_still_referenced_pct']:.1f}%) " + f"last=requested:{summary['rewrite_exec_source_segments_requested_last']} " + f"unref:{summary['rewrite_exec_source_segments_unreferenced_last']} " + f"still_ref:{summary['rewrite_exec_source_segments_still_referenced_last']}" + ) print( " selected stale vs processed stale: " f"{human_bytes(summary['rewrite_plan_selected_bytes_stale'])} -> {human_bytes(summary['rewrite_processed_stale_bytes'])} " @@ -467,6 +531,13 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst f"live={summary['retained_prune_live_skipped_segments']} 
({human_bytes(summary['retained_prune_live_skipped_bytes'])}) " f"zombie_marked={summary['retained_prune_zombie_marked_segments']} ({human_bytes(summary['retained_prune_zombie_marked_bytes'])})" ) + print( + " zombie inventory: " + f"total={summary['vlog_zombie_segments']} ({human_bytes(summary['vlog_zombie_bytes'])}) " + f"pinned={summary['vlog_zombie_pinned_segments']} ({human_bytes(summary['vlog_zombie_pinned_bytes'])}) " + f"unpinned={summary['vlog_zombie_unpinned_segments']} ({human_bytes(summary['vlog_zombie_unpinned_bytes'])}) " + f"(pinned_bytes_pct={summary['vlog_zombie_pinned_bytes_pct']:.1f}%)" + ) print("") print("Observed-source replay") @@ -492,6 +563,19 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst f"deleted_pct={summary['observed_gc_source_bytes_deleted_pct']:.1f}%, " f"deleted_of_eligible={summary['observed_gc_source_bytes_deleted_of_eligible_pct']:.1f}%)" ) + print( + " observed-source retained-prune totals: " + f"seen={summary['retained_prune_observed_source_segments_total']} ({human_bytes(summary['retained_prune_observed_source_bytes_total'])}) " + f"candidate={summary['retained_prune_observed_source_candidate_segments_total']} ({human_bytes(summary['retained_prune_observed_source_candidate_bytes_total'])}) " + f"removed={summary['retained_prune_observed_source_removed_segments_total']} ({human_bytes(summary['retained_prune_observed_source_removed_bytes_total'])}) " + f"zombie_marked={summary['retained_prune_observed_source_zombie_marked_segments_total']} ({human_bytes(summary['retained_prune_observed_source_zombie_marked_bytes_total'])}) " + f"live_skipped={summary['retained_prune_observed_source_live_skipped_segments_total']} ({human_bytes(summary['retained_prune_observed_source_live_skipped_bytes_total'])}) " + f"in_use_skipped={summary['retained_prune_observed_source_in_use_skipped_segments_total']} ({human_bytes(summary['retained_prune_observed_source_in_use_skipped_bytes_total'])}) " + 
f"(removed_of_candidate={summary['retained_prune_observed_removed_candidate_segments_pct']:.1f}% seg / " + f"{summary['retained_prune_observed_removed_candidate_bytes_pct']:.1f}% bytes, " + f"live_skip_of_candidate={summary['retained_prune_observed_live_skipped_candidate_segments_pct']:.1f}% seg / " + f"{summary['retained_prune_observed_live_skipped_candidate_bytes_pct']:.1f}% bytes)" + ) print("") notes: list[str] = [] @@ -503,6 +587,13 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst notes.append("maintenance collision rate is high; lane contention may be throttling rewrite/GC progress") if summary["rewrite_segment_realization_pct"] < 60.0 and summary["rewrite_plan_selected_segments_total"] > 0: notes.append("rewrite segment realization is low; staged debt is being selected faster than executed") + if ( + summary["rewrite_exec_source_segments_unreferenced_total"] > 0 + and summary["retained_prune_observed_source_zombie_marked_segments_total"] > 0 + and summary["observed_gc_source_segments_deleted_total"] == 0 + and summary["vlog_zombie_segments"] == 0 + ): + notes.append("rewrite-selected sources became unreferenced and were zombie-marked, but GC delete counters stayed zero; reclaim likely happened via zombie lifecycle outside GC byte accounting") if not notes: notes.append("no obvious maintenance-lane bottleneck signature in this snapshot") diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md index 5713c7665..00a52cf2b 100644 --- a/worklog/2026-03-28.md +++ b/worklog/2026-03-28.md @@ -96,3 +96,69 @@ - post-rewrite size/gzip: - `du -sb`: `2239722809` - `tar|gzip|wc -c`: `1805021465` + +- Added retained-prune observed-source cumulative counters (not just last-run snapshot): + - `treedb.cache.vlog_retained_prune.observed_source.segments_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_candidate_total` + - 
`treedb.cache.vlog_retained_prune.observed_source.bytes_candidate_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_removed_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_removed_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_in_use_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_in_use_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_live_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_live_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_parse_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_parse_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_zombie_marked_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_zombie_marked_total` + +- Added value-log zombie inventory stats from manager: + - `treedb.cache.vlog_zombie.segments` + - `treedb.cache.vlog_zombie.bytes` + - `treedb.cache.vlog_zombie.pinned_segments` + - `treedb.cache.vlog_zombie.pinned_bytes` + - `treedb.cache.vlog_zombie.unpinned_segments` + - `treedb.cache.vlog_zombie.unpinned_bytes` + - plus process-memory estimates for zombie bytes. + +- Analyzer/report updates: + - Include rewrite source outcomes (`requested/still_referenced/unreferenced`). + - Include observed-source retained-prune cumulative outcomes. + - Include zombie inventory (pinned vs unpinned bytes). + - Add signal note when rewrite-selected segments become unreferenced and zombie-marked while GC delete counters remain zero. 
+ +- Validation run (`fast`, forced trigger, new counters) + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=536870912 TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=134217728 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328063722` + - analyzer highlights: + - rewrite source outcomes: `requested_total=4 unreferenced_total=4 still_referenced_total=0` + - observed-source retained-prune totals: `seen=4 (1.00 GiB), candidate=4 (1.00 GiB), zombie_marked=4 (1.00 GiB), live_skipped=0` + - observed-source GC cumulative: `total=3.50 GiB, eligible=0, deleted=0, protected_retained=3.50 GiB` + - retained-prune global: `zombie_marked=4 (1.00 GiB)` + - interpretation: + - rewrite-selected source segments are becoming unreferenced and are then being zombie-marked in retained-prune; replay queue is draining. + - zero observed-source GC deleted bytes is not explained by queue starvation or live-skips on observed sources. + +- Second validation run with zombie inventory keys active: + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328065017` + - analyzer highlights: + - rewrite source outcomes: `requested_total=4 unreferenced_total=4 still_referenced_total=0` + - observed-source retained-prune totals: `seen=4, candidate=4, zombie_marked=4, removed=0, live_skipped=0` + - zombie inventory at final snapshot: `total=0, pinned=0, unpinned=0` + - observed-source GC cumulative still `eligible=0 deleted=0 protected_retained=2.75 GiB` + - interpretation: + - observed-source segments are zombie-marked and eventually not present as tracked zombies by run end, yet GC delete counters remain zero; this indicates reclaim is occurring outside current GC deleted-byte accounting and that the larger disk gap is primarily about how much stale data live rewrite selected during the run. 
+ +- Headroom check on same run (`20260328065017`) via offline rewrite: + - pre: `du -sb maindb/wal = 3805802931` + - command: + - `/home/mikers/dev/snissn/celestia-app-p4/build/treemap-local vlog-rewrite /home/mikers/.celestia-app-mainnet-treedb-20260328065017/data/application.db -rw` + - output: + - `vlog-rewrite: segments_before=20 segments_after=16 bytes_before=3805798835 bytes_after=2068426925 records=983187` + - post: `du -sb maindb/wal = 2068431021` + - implication: + - ~1.74 GiB additional compaction headroom remains versus end-of-live-run size under this workload. From a9e6fc3a9ae07801e33043a3a3d5fb599de0b6fa Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 07:42:36 -1000 Subject: [PATCH 33/61] caching: honor configured stale-ratio threshold in generic rewrite --- TreeDB/caching/db.go | 7 ++-- .../caching/vlog_generation_scheduler_test.go | 17 +++++--- worklog/2026-03-28.md | 42 +++++++++++++++++++ 3 files changed, 57 insertions(+), 9 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 14241a7c0..88dd47510 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -15440,11 +15440,10 @@ func (db *DB) vlogGenerationRewriteMinStaleRatioForGenericPass(totalBytes int64) if totalBytes < vlogGenerationRewriteEfficacyMinTotalBytes { return 0 } - ratio := vlogGenerationRewriteGenericMinSegmentStaleRatio - if configured := db.vlogGenerationRewriteMinStaleRatioForStaleRatioTrigger(totalBytes); configured > ratio { - ratio = configured + if configured := db.vlogGenerationRewriteMinStaleRatioForStaleRatioTrigger(totalBytes); configured > 0 { + return configured } - return ratio + return vlogGenerationRewriteGenericMinSegmentStaleRatio } func (db *DB) vlogGenerationRewriteMinStaleRatioForQueuedDebt(totalBytes int64, reason uint32) float64 { diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index b2305abc4..c3e0325e6 100644 --- 
a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -487,16 +487,16 @@ func TestRunVlogGenerationMaintenanceRetries_CoalescesPendingCollisionRetries(t } } -func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesQualityFloor(t *testing.T) { +func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesConfiguredTriggerRatio(t *testing.T) { db := &DB{valueLogRewriteTriggerRatioPPM: 200000} - if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { + if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), 0.50; got != want { t.Fatalf("generic min stale ratio=%f want=%f", got, want) } } -func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesHigherConfiguredRatio(t *testing.T) { +func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesHigherConfiguredTriggerRatio(t *testing.T) { db := &DB{valueLogRewriteTriggerRatioPPM: 800000} - if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { + if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), 0.80; got != want { t.Fatalf("generic min stale ratio=%f want=%f", got, want) } } @@ -515,9 +515,16 @@ func TestVlogGenerationRewriteMinStaleRatioForGenericPass_DisabledBelowEfficacyF } } +func TestVlogGenerationRewriteMinStaleRatioForGenericPass_DefaultWithoutConfiguredTrigger(t *testing.T) { + db := &DB{valueLogRewriteTriggerRatioPPM: 0} + if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { + t.Fatalf("generic min stale ratio=%f want=%f", got, want) + } +} + func TestVlogGenerationRewriteMinStaleRatioForQueuedDebt_UsesGenericFloorForTotalBytes(t *testing.T) { db := &DB{valueLogRewriteTriggerRatioPPM: 200000} - if got, want := 
db.vlogGenerationRewriteMinStaleRatioForQueuedDebt(8<<30, vlogGenerationReasonTotalBytes), vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { + if got, want := db.vlogGenerationRewriteMinStaleRatioForQueuedDebt(8<<30, vlogGenerationReasonTotalBytes), 0.50; got != want { t.Fatalf("queued total-bytes min stale ratio=%f want=%f", got, want) } } diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md index 00a52cf2b..69db38543 100644 --- a/worklog/2026-03-28.md +++ b/worklog/2026-03-28.md @@ -162,3 +162,45 @@ - post: `du -sb maindb/wal = 2068431021` - implication: - ~1.74 GiB additional compaction headroom remains versus end-of-live-run size under this workload. + +- Stale-ratio trigger sweep (live run_celestia) to isolate rewrite-selection threshold impact: + - low stale ratio path (forces ~0.50 segment threshold): + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=536870912 TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=0 TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM=100000 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260328070649` + - analyzer highlights: + - `selected->executed segments = 13 -> 7` + - `processed_stale_bytes = 1.51 GiB` + - end WAL: `3093987987` + - offline rewrite on same home: `3093983891 -> 2128313686` (`du -sb` post `2128317782`) + - high stale ratio control (~0.85 threshold): + - command: + - `... TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=0 TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM=850000 ...` + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260328071500` + - analyzer highlights: + - `selected->executed segments = 4 -> 4` + - `processed_stale_bytes = 983.89 MiB` + - end WAL: `3944887635` + - interpretation: + - lower stale-threshold selection materially improves in-run compaction and closes offline headroom. 
+ +- Code change: allow explicitly configured stale-ratio trigger to drive generic/total-bytes rewrite segment selection threshold. + - file: `TreeDB/caching/db.go` + - changed `vlogGenerationRewriteMinStaleRatioForGenericPass` so when `TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM` is set, generic planning uses that configured threshold (with existing stale-ratio floor behavior), instead of always enforcing the stricter generic constant. + - default behavior remains unchanged when stale-ratio trigger is unset. + - tests updated in `TreeDB/caching/vlog_generation_scheduler_test.go`: + - generic pass uses configured trigger ratio when set + - queued debt under total-bytes reflects configured ratio + - default generic ratio remains unchanged when trigger ratio is unset + +- Validation run after code change with both triggers enabled: + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=536870912 TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=134217728 TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM=100000 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260328072830` + - analyzer highlights: + - `selected->executed segments = 8 -> 8` + - `processed_stale_bytes = 1.58 GiB` + - end WAL: `3320308275` (improved vs prior total-bytes-trigger baselines around `3.7-3.9 GiB`) + - offline rewrite on same home: `3320304179 -> 2132071399` (`du -sb` post `2132075495`) + - interpretation: + - the threshold change improves total-bytes-triggered live rewrite coverage while preserving trigger semantics. 
From 8e9a018a23cdcb4e311b6b9786c2ec90f7eb506f Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 08:52:34 -1000 Subject: [PATCH 34/61] caching: add pre-checkpoint rewrite override for WAL-off runs --- TreeDB/caching/db.go | 41 ++++- .../caching/vlog_generation_scheduler_test.go | 172 ++++++++++++++++++ docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md | 7 + scripts/analyze_vlog_maintenance_capacity.py | 2 + worklog/2026-03-28.md | 45 +++++ 5 files changed, 260 insertions(+), 7 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 88dd47510..575e4694e 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -1860,6 +1860,10 @@ const ( envDisableVlogGenerationVacuum = "TREEDB_DISABLE_VLOG_GENERATION_VACUUM" envDisableVlogGenerationLoop = "TREEDB_DISABLE_VLOG_GENERATION_LOOP" envDisableVlogGenerationCheckpointKick = "TREEDB_DISABLE_VLOG_GENERATION_CHECKPOINT_KICK" + // Experimental WAL-off override: allow rewrite planning/execution before the + // first explicit checkpoint. Disabled by default because it can add restore + // contention during early state-sync. + envEnableVlogGenerationPreCheckpointRewrite = "TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE" // Diagnostic toggle for WAL-off checkpoint-time sparse-index vacuum. envDisableCheckpointAutoVacuum = "TREEDB_DISABLE_CHECKPOINT_AUTO_VACUUM" minMemtablePrealloc = 64 * 1024 @@ -6039,6 +6043,11 @@ const ( // During checkpoint-kick debt drain, allow a bounded multi-segment rewrite // selection so debt can converge faster than one-segment-per-pass. vlogGenerationRewriteDebtDrainMaxSegments = 8 + // Freshly planned rewrites normally execute one segment to limit immediate + // write amplification. In explicit debt-drain mode, allow a small burst once + // the queue is materially large so convergence does not stall. 
+ vlogGenerationRewriteFreshPlanDebtDrainMinSegments = 4 + vlogGenerationRewriteFreshPlanDebtDrainMaxSegments = 4 ) func (db *DB) flushBackendEntriesCap(totalOps int, sync bool) int { @@ -12964,6 +12973,23 @@ func (db *DB) vlogGenerationRewriteMaxSegmentsForRun(queueLen int, budgetTokens return maxSegments } +func (db *DB) vlogGenerationRewriteMaxSegmentsForFreshPlan(queueLen int, budgetTokens int64, opts vlogGenerationMaintenanceOptions) int { + if db == nil || queueLen <= 1 || !opts.rewriteDebtDrain { + return vlogGenerationRewriteResumeMaxSegments + } + if queueLen < vlogGenerationRewriteFreshPlanDebtDrainMinSegments { + return vlogGenerationRewriteResumeMaxSegments + } + maxSegments := db.vlogGenerationRewriteMaxSegmentsForRun(queueLen, budgetTokens, opts) + if maxSegments > vlogGenerationRewriteFreshPlanDebtDrainMaxSegments { + maxSegments = vlogGenerationRewriteFreshPlanDebtDrainMaxSegments + } + if maxSegments < 1 { + maxSegments = 1 + } + return maxSegments +} + const maxPositiveInt64 = int64(^uint64(0) >> 1) func addClampInt64(cur, add, limit int64) int64 { @@ -14021,7 +14047,8 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // caused real restore stalls. Keep WAL-on profiles eligible for maintenance // before the first checkpoint; starving that path causes the main value-log // lane to grow unchecked during restore. - if db.disableJournal && db.checkpointRuns.Load() == 0 && !runGC && len(rewriteQueue) == 0 && !opts.skipCheckpoint { + allowPreCheckpointRewrite := envBool(envEnableVlogGenerationPreCheckpointRewrite) + if db.disableJournal && db.checkpointRuns.Load() == 0 && !runGC && len(rewriteQueue) == 0 && !opts.skipCheckpoint && !allowPreCheckpointRewrite { db.vlogGenerationMaintenanceSkipPreCheckpoint.Add(1) return } @@ -14510,12 +14537,12 @@ planned: // Do not debt-drain freshly planned work in the same pass. 
The only // exception is a confirmed staged rewrite-resume pass, which should // be allowed to consume debt in bounded multi-segment chunks. - allowPlanDebtDrain := reason == vlogGenerationReasonRewriteResume && opts.rewriteDebtDrain - if allowPlanDebtDrain { - rewriteMaxSegments = db.vlogGenerationRewriteMaxSegmentsForRun(len(rewriteQueue), budgetTokens, opts) - } else { - rewriteMaxSegments = vlogGenerationRewriteResumeMaxSegments - } + allowPlanDebtDrain := reason == vlogGenerationReasonRewriteResume && opts.rewriteDebtDrain + if allowPlanDebtDrain { + rewriteMaxSegments = db.vlogGenerationRewriteMaxSegmentsForRun(len(rewriteQueue), budgetTokens, opts) + } else { + rewriteMaxSegments = db.vlogGenerationRewriteMaxSegmentsForFreshPlan(len(rewriteQueue), budgetTokens, opts) + } // If the token bucket is enabled and empty, persist the plan/ledger but // skip running the rewrite until we have budget to spend. if db.vlogGenerationRewriteBudgetEnabled() && budgetTokens <= 0 { diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index c3e0325e6..692e6b389 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -730,6 +730,51 @@ func TestVlogGenerationRewriteMaxSegmentsForRun_ClampsDebtDrainQueue(t *testing. 
} } +func TestVlogGenerationRewriteMaxSegmentsForFreshPlan_BelowQueueThreshold(t *testing.T) { + db := &DB{ + valueLogRewriteBudgetBytes: 1024, + valueLogGenerationWarmTarget: 256, + } + got := db.vlogGenerationRewriteMaxSegmentsForFreshPlan( + vlogGenerationRewriteFreshPlanDebtDrainMinSegments-1, + 1<<20, + vlogGenerationMaintenanceOptions{rewriteDebtDrain: true, debugSource: "rewrite_age_blocked"}, + ) + if got != vlogGenerationRewriteResumeMaxSegments { + t.Fatalf("fresh-plan queue 2132071399` (`du -sb` post `2132075495`) - interpretation: - the threshold change improves total-bytes-triggered live rewrite coverage while preserving trigger semantics. + +- Follow-up experiments (fresh-plan burst + WAL-off pre-checkpoint gate) + - Added fresh-plan debt-drain burst policy for planned rewrite queues: + - `vlogGenerationRewriteFreshPlanDebtDrainMinSegments=4` + - `vlogGenerationRewriteFreshPlanDebtDrainMaxSegments=4` + - path: `TreeDB/caching/db.go` (`vlogGenerationRewriteMaxSegmentsForFreshPlan`) + - tests: `TestVlogGenerationRewriteMaxSegmentsForFreshPlan_*` + +- Capacity analyzer output improvement: + - `scripts/analyze_vlog_maintenance_capacity.py` now prints `pre_checkpoint` and `priority` in the maintenance skip-pressure line. + +- Root-cause check for no-rewrite outlier: + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260328082023` + - observed: + - analyzer: `rewrite runs=0`, end WAL `5522118526` + - skip counters (`debug_vars`): `maintenance.skip.before_first_checkpoint=11` + - interpretation: + - WAL-off pre-checkpoint gate can suppress all rewrite activity on some short runs. + +- Added experimental override for WAL-off pre-checkpoint rewrite: + - env: `TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1` + - default remains disabled. + - gate change in `TreeDB/caching/db.go` allows bypassing `maintenance.skip.before_first_checkpoint` when env is set. + - docs updated: `docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md`. 
+ - tests added: + - `TestVlogGenerationMaintenance_WALOffPreCheckpointSkipsRewriteByDefault` + - `TestVlogGenerationMaintenance_WALOffPreCheckpointCanRunWithEnvOverride` + +- Validation runs + - Baseline-like run (no override), home `/home/mikers/.celestia-app-mainnet-treedb-20260328075104`: + - end WAL: `3438411416` + - analyzer: `rewrite runs=8`, `selected->executed=8->8`, `processed_stale=1.59 GiB` + - offline rewrite: `3438407320 -> 2171030759` (post `du -sb`: `2171034855`) + - No-rewrite outlier (no override), home `/home/mikers/.celestia-app-mainnet-treedb-20260328082023`: + - end WAL: `5522118526` + - analyzer: `rewrite runs=0`, `pre_checkpoint skip dominated` + - offline rewrite: `5522114430 -> 2205781521` (post `du -sb`: `2205785617`) + - Override run (`TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1`), home `/home/mikers/.celestia-app-mainnet-treedb-20260328083336`: + - end WAL: `3477220043` + - analyzer: `pre_checkpoint=0`, `rewrite runs=8`, `selected->executed=9->8`, `processed_stale=1.59 GiB` + - offline rewrite: `3477215947 -> 2238622807` (post `du -sb`: `2238626903`) + +- Takeaway: + - pre-checkpoint gating is a first-order driver of run-to-run variance in live rewrite coverage under WAL-off fast runs. + - enabling the pre-checkpoint override avoids the catastrophic `rewrite runs=0` failure mode and restores expected live rewrite activity. 
From 8806b4e822f857da184b18653b95ca230baba47a Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 10:55:34 -1000 Subject: [PATCH 35/61] analyzer: surface rewrite plan-empty reasons --- scripts/analyze_vlog_maintenance_capacity.py | 16 +++++++ worklog/2026-03-28.md | 48 ++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py index 1033aa1ce..f98c0245c 100755 --- a/scripts/analyze_vlog_maintenance_capacity.py +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -222,7 +222,12 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: "rewrite_plan_runs": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_runs"), "rewrite_plan_selected": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected"), "rewrite_plan_empty": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_empty"), + "rewrite_plan_empty_no_selection": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_empty.no_selection"), + "rewrite_plan_empty_age_blocked": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_empty.age_blocked"), "rewrite_plan_selected_segments_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected_segments_total"), + "rewrite_plan_penalty_filter_runs": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_penalty_filter.runs"), + "rewrite_plan_penalty_filter_segments": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_penalty_filter.segments"), + "rewrite_plan_penalty_filter_to_empty_runs": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_penalty_filter.to_empty_runs"), "rewrite_exec_source_segments_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_total"), "rewrite_exec_source_segments_requested_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"), 
"rewrite_exec_source_segments_still_referenced_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"), @@ -457,6 +462,17 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst f"{summary['rewrite_plan_runs']} / {summary['rewrite_plan_selected']} / {summary['rewrite_plan_empty']} " f"(select_rate={summary['rewrite_plan_select_rate_pct']:.1f}%)" ) + print( + " plan-empty breakdown: " + f"no_selection={summary['rewrite_plan_empty_no_selection']} " + f"age_blocked={summary['rewrite_plan_empty_age_blocked']}" + ) + print( + " plan penalty-filter: " + f"runs={summary['rewrite_plan_penalty_filter_runs']} " + f"segments={summary['rewrite_plan_penalty_filter_segments']} " + f"to_empty_runs={summary['rewrite_plan_penalty_filter_to_empty_runs']}" + ) print( " selected->executed segments: " f"{summary['rewrite_plan_selected_segments_total']} -> {summary['rewrite_exec_source_segments_total']} " diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md index 8ee838c56..4b7fb02ba 100644 --- a/worklog/2026-03-28.md +++ b/worklog/2026-03-28.md @@ -249,3 +249,51 @@ - Takeaway: - pre-checkpoint gating is a first-order driver of run-to-run variance in live rewrite coverage under WAL-off fast runs. - enabling the pre-checkpoint override avoids the catastrophic `rewrite runs=0` failure mode and restores expected live rewrite activity. + +- Additional live sweep (focus: robust lower end-of-run WAL under `fast` + pre-checkpoint rewrite): + - fixed env baseline: + - `TREEDB_OPEN_PROFILE=fast` + - `TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1` + - `TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=1073741824` + - `FREEZE_REMOTE_HEIGHT_AT_START=1` + - no total-bytes backstop (outlier repro): + - command: + - `... 
TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=0 TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM=100000 ...` + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260328094006` + - analyzer highlights: + - `rewrite runs=1` + - `selected->executed=3->2` + - `processed_stale=475.49 MiB` + - `skip stage_gate/stage_not_due=7/7` + - end WAL: `4274361669` + - offline rewrite: `4274357573 -> 2093567828` (post `du -sb`: `2093571924`, `gzip -1`: `1749325383`) + - add total-bytes backstop @ `128 MiB`, stale ratio `100k`: + - run homes: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328094856` + - `/home/mikers/.celestia-app-mainnet-treedb-20260328100118` (replicate) + - analyzer highlights: + - run1: `rewrite runs=7`, `selected->executed=7->7`, `processed_stale=1.44 GiB`, `end WAL=3362578071` + - run2: `rewrite runs=6`, `selected->executed=6->6`, `processed_stale=1.27 GiB`, `end WAL=3574791009` + - offline rewrite: + - run1: `3362573975 -> 2116702484` (post `du -sb`: `2116706580`, `gzip -1`: `1767440551`) + - run2: `3574786913 -> 2132053768` (post `du -sb`: `2132057864`, `gzip -1`: `1778930169`) + - total-bytes backstop @ `64 MiB`, stale ratio sweep: + - stale `100k` (`/home/mikers/.celestia-app-mainnet-treedb-20260328101255`): + - `rewrite runs=8`, `selected->executed=8->8`, `processed_stale=1.60 GiB`, `end WAL=3391412031` + - rewrite `3391407935 -> 2156175550` (post `du -sb`: `2156179646`, `gzip -1`: `1793519331`) + - stale `50k` (`/home/mikers/.celestia-app-mainnet-treedb-20260328102614`): + - `rewrite runs=7`, `selected->executed=7->7`, `processed_stale=1.40 GiB`, `end WAL=3569727005` + - rewrite `3569722909 -> 2175068990` (post `du -sb`: `2175073086`, `gzip -1`: `1806440477`) + - stale `10k` (`/home/mikers/.celestia-app-mainnet-treedb-20260328103947`): + - `rewrite runs=9`, `selected->executed=9->9`, `processed_stale=1.77 GiB`, `end WAL=3588198674` + - rewrite `3588194578 -> 2188079023` (post `du -sb`: `2188083119`, `gzip -1`: `1817157727`) + - interpretation: + - adding 
a nonzero `trigger_total_bytes` backstop prevents the catastrophic low-coverage outlier seen with stale-ratio-only triggering. + - in this window, pushing stale-ratio lower (`100k -> 50k -> 10k`) increases rewrite volume but does **not** improve end-of-run or post-rewrite bytes; it trends worse, consistent with extra rewrite churn without live reclaim. + - best observed point in this sweep: `trigger_total_bytes=128MiB`, `stale_ratio_ppm=100000` (lowest end WAL and best post-rewrite/gzip among these runs). + +- Capacity analyzer output improvement (follow-up): + - `scripts/analyze_vlog_maintenance_capacity.py` now prints: + - `plan-empty breakdown: no_selection / age_blocked` + - `plan penalty-filter: runs / segments / to_empty_runs` + - this helps distinguish threshold-limited empty plans (`no_selection`) from penalty/cooldown suppression. From db46ff7081895781da60a3557caeeee96d54eff9 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 11:01:28 -1000 Subject: [PATCH 36/61] vlog: add observed-source protection mix counters --- TreeDB/caching/db.go | 21 +++++++ .../caching/vlog_generation_scheduler_test.go | 28 +++++++++ scripts/analyze_vlog_maintenance_capacity.py | 58 +++++++++++++++++++ worklog/2026-03-28.md | 14 +++++ 4 files changed, 121 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 575e4694e..ce07ab36d 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5720,10 +5720,17 @@ type DB struct { vlogGenerationObservedGCSourceSegmentsTotal atomic.Uint64 vlogGenerationObservedGCSourceSegmentsEligibleTotal atomic.Uint64 vlogGenerationObservedGCSourceSegmentsDeletedTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsProtectedInUseTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsProtectedRetainedTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsProtectedOverlapTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsProtectedOtherTotal atomic.Uint64 
vlogGenerationObservedGCSourceBytesTotal atomic.Int64 vlogGenerationObservedGCSourceBytesEligibleTotal atomic.Int64 vlogGenerationObservedGCSourceBytesDeletedTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesProtectedInUseTotal atomic.Int64 vlogGenerationObservedGCSourceBytesProtectedRetainedTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesProtectedOverlapTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesProtectedOtherTotal atomic.Int64 retainedPruneMu sync.Mutex retainedPruneDone chan struct{} vlogGenerationRemapSuccesses atomic.Uint64 @@ -13190,10 +13197,17 @@ func (db *DB) observeVlogGenerationGCStats(stats backenddb.ValueLogGCStats) { db.vlogGenerationObservedGCSourceSegmentsTotal.Add(uint64(stats.ObservedSourceSegments)) db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Add(uint64(stats.ObservedSourceSegmentsEligible)) db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Add(uint64(stats.ObservedSourceSegmentsDeleted)) + db.vlogGenerationObservedGCSourceSegmentsProtectedInUseTotal.Add(uint64(stats.ObservedSourceSegmentsProtectedInUse)) + db.vlogGenerationObservedGCSourceSegmentsProtectedRetainedTotal.Add(uint64(stats.ObservedSourceSegmentsProtectedRetained)) + db.vlogGenerationObservedGCSourceSegmentsProtectedOverlapTotal.Add(uint64(stats.ObservedSourceSegmentsProtectedOverlap)) + db.vlogGenerationObservedGCSourceSegmentsProtectedOtherTotal.Add(uint64(stats.ObservedSourceSegmentsProtectedOther)) db.vlogGenerationObservedGCSourceBytesTotal.Add(stats.ObservedSourceBytes) db.vlogGenerationObservedGCSourceBytesEligibleTotal.Add(stats.ObservedSourceBytesEligible) db.vlogGenerationObservedGCSourceBytesDeletedTotal.Add(stats.ObservedSourceBytesDeleted) + db.vlogGenerationObservedGCSourceBytesProtectedInUseTotal.Add(stats.ObservedSourceBytesProtectedInUse) db.vlogGenerationObservedGCSourceBytesProtectedRetainedTotal.Add(stats.ObservedSourceBytesProtectedRetained) + 
db.vlogGenerationObservedGCSourceBytesProtectedOverlapTotal.Add(stats.ObservedSourceBytesProtectedOverlap) + db.vlogGenerationObservedGCSourceBytesProtectedOtherTotal.Add(stats.ObservedSourceBytesProtectedOther) } func (db *DB) observeVlogGenerationVacuumExecDuration(d time.Duration) { @@ -20897,10 +20911,17 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.observed_gc.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsTotal.Load()) stats["treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Load()) stats["treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_in_use_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsProtectedInUseTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_retained_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsProtectedRetainedTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_overlap_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsProtectedOverlapTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_other_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsProtectedOtherTotal.Load()) stats["treedb.cache.vlog_generation.observed_gc.source_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesTotal.Load()) stats["treedb.cache.vlog_generation.observed_gc.source_bytes_eligible_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesEligibleTotal.Load()) stats["treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total"] = fmt.Sprintf("%d", 
db.vlogGenerationObservedGCSourceBytesDeletedTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_in_use_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesProtectedInUseTotal.Load()) stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesProtectedRetainedTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_overlap_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesProtectedOverlapTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_other_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesProtectedOtherTotal.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteExecSourceSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsRequestedTotal.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsStillReferencedTotal.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 692e6b389..eb7636bc3 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -6271,10 +6271,17 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationObservedGCSourceSegmentsTotal.Store(11) db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Store(5) db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Store(3) + db.vlogGenerationObservedGCSourceSegmentsProtectedInUseTotal.Store(1) + db.vlogGenerationObservedGCSourceSegmentsProtectedRetainedTotal.Store(2) + 
db.vlogGenerationObservedGCSourceSegmentsProtectedOverlapTotal.Store(3) + db.vlogGenerationObservedGCSourceSegmentsProtectedOtherTotal.Store(4) db.vlogGenerationObservedGCSourceBytesTotal.Store(1100) db.vlogGenerationObservedGCSourceBytesEligibleTotal.Store(500) db.vlogGenerationObservedGCSourceBytesDeletedTotal.Store(300) + db.vlogGenerationObservedGCSourceBytesProtectedInUseTotal.Store(50) db.vlogGenerationObservedGCSourceBytesProtectedRetainedTotal.Store(250) + db.vlogGenerationObservedGCSourceBytesProtectedOverlapTotal.Store(75) + db.vlogGenerationObservedGCSourceBytesProtectedOtherTotal.Store(25) db.vlogGenerationRewriteQueueMu.Lock() db.vlogGenerationRewriteQueueLoaded = true @@ -6567,6 +6574,18 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"]; got != "3" { t.Fatalf("observed gc source segments deleted total=%q want 3", got) } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_in_use_total"]; got != "1" { + t.Fatalf("observed gc source segments protected in-use total=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_retained_total"]; got != "2" { + t.Fatalf("observed gc source segments protected retained total=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_overlap_total"]; got != "3" { + t.Fatalf("observed gc source segments protected overlap total=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_other_total"]; got != "4" { + t.Fatalf("observed gc source segments protected other total=%q want 4", got) + } if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_total"]; got != "1100" { t.Fatalf("observed gc source bytes total=%q want 1100", got) } @@ -6576,7 +6595,16 @@ func 
TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total"]; got != "300" { t.Fatalf("observed gc source bytes deleted total=%q want 300", got) } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_in_use_total"]; got != "50" { + t.Fatalf("observed gc source bytes protected in-use total=%q want 50", got) + } if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total"]; got != "250" { t.Fatalf("observed gc source bytes protected retained total=%q want 250", got) } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_overlap_total"]; got != "75" { + t.Fatalf("observed gc source bytes protected overlap total=%q want 75", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_other_total"]; got != "25" { + t.Fatalf("observed gc source bytes protected other total=%q want 25", got) + } } diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py index f98c0245c..a719f5cff 100755 --- a/scripts/analyze_vlog_maintenance_capacity.py +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -300,10 +300,17 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: "observed_gc_source_segments_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_total"), "observed_gc_source_segments_eligible_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"), "observed_gc_source_segments_deleted_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"), + "observed_gc_source_segments_protected_in_use_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_protected_in_use_total"), + "observed_gc_source_segments_protected_retained_total": metric_int(stats, 
"treedb.cache.vlog_generation.observed_gc.source_segments_protected_retained_total"), + "observed_gc_source_segments_protected_overlap_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_protected_overlap_total"), + "observed_gc_source_segments_protected_other_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_protected_other_total"), "observed_gc_source_bytes_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_total"), "observed_gc_source_bytes_eligible_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_eligible_total"), "observed_gc_source_bytes_deleted_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total"), + "observed_gc_source_bytes_protected_in_use_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_protected_in_use_total"), "observed_gc_source_bytes_protected_retained_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total"), + "observed_gc_source_bytes_protected_overlap_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_protected_overlap_total"), + "observed_gc_source_bytes_protected_other_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_protected_other_total"), "checkpoint_kick_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.runs"), "checkpoint_kick_gc_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.gc_runs"), "checkpoint_kick_rewrite_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"), @@ -388,6 +395,38 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: m["observed_gc_source_bytes_deleted_total"], m["observed_gc_source_bytes_eligible_total"], ) + m["observed_gc_source_segments_protected_in_use_pct"] = pct( + m["observed_gc_source_segments_protected_in_use_total"], 
+ m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_segments_protected_retained_pct"] = pct( + m["observed_gc_source_segments_protected_retained_total"], + m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_segments_protected_overlap_pct"] = pct( + m["observed_gc_source_segments_protected_overlap_total"], + m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_segments_protected_other_pct"] = pct( + m["observed_gc_source_segments_protected_other_total"], + m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_bytes_protected_in_use_pct"] = pct( + m["observed_gc_source_bytes_protected_in_use_total"], + m["observed_gc_source_bytes_total"], + ) + m["observed_gc_source_bytes_protected_retained_pct"] = pct( + m["observed_gc_source_bytes_protected_retained_total"], + m["observed_gc_source_bytes_total"], + ) + m["observed_gc_source_bytes_protected_overlap_pct"] = pct( + m["observed_gc_source_bytes_protected_overlap_total"], + m["observed_gc_source_bytes_total"], + ) + m["observed_gc_source_bytes_protected_other_pct"] = pct( + m["observed_gc_source_bytes_protected_other_total"], + m["observed_gc_source_bytes_total"], + ) m["retained_prune_removed_candidate_segments_pct"] = pct( m["retained_prune_removed_segments"], m["retained_prune_candidate_segments"], @@ -581,6 +620,25 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst f"deleted_pct={summary['observed_gc_source_bytes_deleted_pct']:.1f}%, " f"deleted_of_eligible={summary['observed_gc_source_bytes_deleted_of_eligible_pct']:.1f}%)" ) + print( + " observed-source protection mix: " + f"segments in_use={summary['observed_gc_source_segments_protected_in_use_total']} " + f"retained={summary['observed_gc_source_segments_protected_retained_total']} " + f"overlap={summary['observed_gc_source_segments_protected_overlap_total']} " + f"other={summary['observed_gc_source_segments_protected_other_total']} " + 
f"(in_use={summary['observed_gc_source_segments_protected_in_use_pct']:.1f}%, " + f"retained={summary['observed_gc_source_segments_protected_retained_pct']:.1f}%, " + f"overlap={summary['observed_gc_source_segments_protected_overlap_pct']:.1f}%, " + f"other={summary['observed_gc_source_segments_protected_other_pct']:.1f}%) " + f"bytes in_use={human_bytes(summary['observed_gc_source_bytes_protected_in_use_total'])} " + f"retained={human_bytes(summary['observed_gc_source_bytes_protected_retained_total'])} " + f"overlap={human_bytes(summary['observed_gc_source_bytes_protected_overlap_total'])} " + f"other={human_bytes(summary['observed_gc_source_bytes_protected_other_total'])} " + f"(in_use={summary['observed_gc_source_bytes_protected_in_use_pct']:.1f}%, " + f"retained={summary['observed_gc_source_bytes_protected_retained_pct']:.1f}%, " + f"overlap={summary['observed_gc_source_bytes_protected_overlap_pct']:.1f}%, " + f"other={summary['observed_gc_source_bytes_protected_other_pct']:.1f}%)" + ) print( " observed-source retained-prune totals: " f"seen={summary['retained_prune_observed_source_segments_total']} ({human_bytes(summary['retained_prune_observed_source_bytes_total'])}) " diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md index 4b7fb02ba..fa3ec5f10 100644 --- a/worklog/2026-03-28.md +++ b/worklog/2026-03-28.md @@ -297,3 +297,17 @@ - `plan-empty breakdown: no_selection / age_blocked` - `plan penalty-filter: runs / segments / to_empty_runs` - this helps distinguish threshold-limited empty plans (`no_selection`) from penalty/cooldown suppression. 
+ +- Observability extension for observed-source GC protection breakdown: + - Added cumulative stats counters in `TreeDB/caching/db.go`: + - `treedb.cache.vlog_generation.observed_gc.source_segments_protected_in_use_total` + - `treedb.cache.vlog_generation.observed_gc.source_segments_protected_retained_total` + - `treedb.cache.vlog_generation.observed_gc.source_segments_protected_overlap_total` + - `treedb.cache.vlog_generation.observed_gc.source_segments_protected_other_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_protected_in_use_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_protected_overlap_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_protected_other_total` + - Extended `scripts/analyze_vlog_maintenance_capacity.py` to report observed-source protection mix (segments + bytes + percentages). + - Updated stats test coverage: + - `TreeDB/caching/vlog_generation_scheduler_test.go` (`TestVlogGenerationStats_ReportRewriteBacklogAndDurations`). From 6cc124c4c000cab383e33d77f1921f41be06beec Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 11:16:13 -1000 Subject: [PATCH 37/61] worklog: capture protection-mix validation run --- worklog/2026-03-28.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md index fa3ec5f10..5db2ed86a 100644 --- a/worklog/2026-03-28.md +++ b/worklog/2026-03-28.md @@ -311,3 +311,20 @@ - Extended `scripts/analyze_vlog_maintenance_capacity.py` to report observed-source protection mix (segments + bytes + percentages). - Updated stats test coverage: - `TreeDB/caching/vlog_generation_scheduler_test.go` (`TestVlogGenerationStats_ReportRewriteBacklogAndDurations`). 
+ +- Validation run using new protection-mix counters (best current config): + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1 TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=1073741824 TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=134217728 TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM=100000 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328110211` + - analyzer highlights: + - `rewrite runs=8`, `selected->executed=9->8`, `processed_stale=1.60 GiB` + - `plan-empty breakdown: no_selection=6 age_blocked=5` + - observed-source protection mix: + - segments: `in_use=0 retained=23 overlap=0 other=0` + - bytes: `in_use=0 B retained=5.75 GiB overlap=0 B other=0 B` + - size: + - end WAL: `3639153423` + - offline rewrite: `3639149327 -> 2230505477` (post `du -sb`: `2230509573`, `gzip -1`: `1848452954`) + - interpretation: + - in this run, observed-source protection is entirely `retained` (not `in_use` or overlap), confirming retained-lifecycle protection as the dominant in-run reclaim blocker. 
From 71be1df8d373487fcb3adef072a48adce283da6a Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 12:09:18 -1000 Subject: [PATCH 38/61] vlog: add observed-source retry budget and celestia a/b harness --- TreeDB/caching/db.go | 167 +++++- .../caching/vlog_generation_scheduler_test.go | 159 ++++++ docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md | 41 ++ scripts/analyze_vlog_maintenance_capacity.py | 24 + scripts/run_celestia_ab.sh | 510 ++++++++++++++++++ 5 files changed, 892 insertions(+), 9 deletions(-) create mode 100755 scripts/run_celestia_ab.sh diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index ce07ab36d..10ff71d72 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4699,10 +4699,14 @@ func (db *DB) queueVlogGenerationObservedSourceGCList(ids []uint32) { if db == nil || len(ids) == 0 { return } + nowUnixNano := time.Now().UnixNano() db.vlogGenerationObservedGCMu.Lock() if db.vlogGenerationObservedGCSourceIDs == nil { db.vlogGenerationObservedGCSourceIDs = make(map[uint32]struct{}, len(ids)) } + if db.vlogGenerationObservedGCFirstQueuedUnixNano == nil { + db.vlogGenerationObservedGCFirstQueuedUnixNano = make(map[uint32]int64, len(ids)) + } added := 0 for _, id := range ids { if id == 0 { @@ -4712,6 +4716,9 @@ func (db *DB) queueVlogGenerationObservedSourceGCList(ids []uint32) { continue } db.vlogGenerationObservedGCSourceIDs[id] = struct{}{} + if _, exists := db.vlogGenerationObservedGCFirstQueuedUnixNano[id]; !exists { + db.vlogGenerationObservedGCFirstQueuedUnixNano[id] = nowUnixNano + } added++ } db.vlogGenerationObservedGCMu.Unlock() @@ -4725,10 +4732,14 @@ func (db *DB) queueVlogGenerationObservedSourceGCIDs(ids map[uint32]struct{}) { if db == nil || len(ids) == 0 { return } + nowUnixNano := time.Now().UnixNano() db.vlogGenerationObservedGCMu.Lock() if db.vlogGenerationObservedGCSourceIDs == nil { db.vlogGenerationObservedGCSourceIDs = make(map[uint32]struct{}, len(ids)) } + if db.vlogGenerationObservedGCFirstQueuedUnixNano == nil 
{ + db.vlogGenerationObservedGCFirstQueuedUnixNano = make(map[uint32]int64, len(ids)) + } added := 0 for id := range ids { if id == 0 { @@ -4738,6 +4749,9 @@ func (db *DB) queueVlogGenerationObservedSourceGCIDs(ids map[uint32]struct{}) { continue } db.vlogGenerationObservedGCSourceIDs[id] = struct{}{} + if _, exists := db.vlogGenerationObservedGCFirstQueuedUnixNano[id]; !exists { + db.vlogGenerationObservedGCFirstQueuedUnixNano[id] = nowUnixNano + } added++ } db.vlogGenerationObservedGCMu.Unlock() @@ -4772,6 +4786,92 @@ func (db *DB) takeVlogGenerationObservedSourceGCList() []uint32 { return out } +func (db *DB) finalizeVlogGenerationObservedSourceGCIDs(ids []uint32, dropped bool) { + if db == nil || len(ids) == 0 { + return + } + nowUnixNano := time.Now().UnixNano() + totalLatencyMS := uint64(0) + maxLatencyMS := uint64(0) + finalized := 0 + seen := make(map[uint32]struct{}, len(ids)) + db.vlogGenerationObservedGCMu.Lock() + for _, id := range ids { + if id == 0 { + continue + } + if _, exists := seen[id]; exists { + continue + } + seen[id] = struct{}{} + finalized++ + delete(db.vlogGenerationObservedGCRetryAttempts, id) + if startUnixNano, exists := db.vlogGenerationObservedGCFirstQueuedUnixNano[id]; exists { + delete(db.vlogGenerationObservedGCFirstQueuedUnixNano, id) + if startUnixNano > 0 && nowUnixNano > startUnixNano { + latencyMS := uint64((nowUnixNano - startUnixNano) / int64(time.Millisecond)) + totalLatencyMS += latencyMS + if latencyMS > maxLatencyMS { + maxLatencyMS = latencyMS + } + } + } + } + db.vlogGenerationObservedGCMu.Unlock() + if finalized == 0 { + return + } + if dropped { + db.vlogGenerationObservedGCLatencyDroppedIDs.Add(uint64(finalized)) + } else { + db.vlogGenerationObservedGCLatencyCompletedIDs.Add(uint64(finalized)) + } + if totalLatencyMS > 0 { + db.vlogGenerationObservedGCLatencyTotalMS.Add(totalLatencyMS) + updateAtomicMaxUint64(&db.vlogGenerationObservedGCLatencyMaxMS, maxLatencyMS) + } +} + +func (db *DB) 
retryVlogGenerationObservedSourceGCList(ids []uint32) (queuedIDs, droppedIDs int) { + if db == nil || len(ids) == 0 { + return 0, 0 + } + retry := make([]uint32, 0, len(ids)) + dropped := make([]uint32, 0, len(ids)) + seen := make(map[uint32]struct{}, len(ids)) + db.vlogGenerationObservedGCMu.Lock() + if db.vlogGenerationObservedGCRetryAttempts == nil { + db.vlogGenerationObservedGCRetryAttempts = make(map[uint32]uint8, len(ids)) + } + for _, id := range ids { + if id == 0 { + continue + } + if _, exists := seen[id]; exists { + continue + } + seen[id] = struct{}{} + attempts := db.vlogGenerationObservedGCRetryAttempts[id] + if attempts >= vlogGenerationObservedGCRetryMaxAttempts { + delete(db.vlogGenerationObservedGCRetryAttempts, id) + dropped = append(dropped, id) + continue + } + db.vlogGenerationObservedGCRetryAttempts[id] = attempts + 1 + retry = append(retry, id) + } + db.vlogGenerationObservedGCMu.Unlock() + if len(retry) > 0 { + db.vlogGenerationObservedGCRetryQueued.Add(1) + db.queueVlogGenerationObservedSourceGCList(retry) + } + if len(dropped) > 0 { + db.vlogGenerationObservedGCRetryDropped.Add(uint64(len(dropped))) + db.finalizeVlogGenerationObservedSourceGCIDs(dropped, true) + } + return len(retry), len(dropped) +} + func (db *DB) scheduleRetainedValueLogPrune() { db.scheduleRetainedValueLogPruneWithForce(false) } @@ -5717,6 +5817,13 @@ type DB struct { vlogGenerationObservedGCTakenIDs atomic.Uint64 vlogGenerationObservedGCRuns atomic.Uint64 vlogGenerationObservedGCRetryQueued atomic.Uint64 + vlogGenerationObservedGCRetryDropped atomic.Uint64 + vlogGenerationObservedGCRetryAttempts map[uint32]uint8 + vlogGenerationObservedGCFirstQueuedUnixNano map[uint32]int64 + vlogGenerationObservedGCLatencyCompletedIDs atomic.Uint64 + vlogGenerationObservedGCLatencyDroppedIDs atomic.Uint64 + vlogGenerationObservedGCLatencyTotalMS atomic.Uint64 + vlogGenerationObservedGCLatencyMaxMS atomic.Uint64 vlogGenerationObservedGCSourceSegmentsTotal atomic.Uint64 
vlogGenerationObservedGCSourceSegmentsEligibleTotal atomic.Uint64 vlogGenerationObservedGCSourceSegmentsDeletedTotal atomic.Uint64 @@ -6019,6 +6126,9 @@ const ( // requests while replay GC is trying to converge. Allow a faster cadence for // that targeted path without dropping the generic min-interval guard. retainedPruneObservedMinInterval = 3 * time.Second + // Bound observed-source replay retries so a permanently retained-protected ID + // cannot stay queued forever when replay GC cannot make progress. + vlogGenerationObservedGCRetryMaxAttempts = uint8(3) // Coordinate index vacuum with major rewrite windows; do not run on every GC. vlogGenerationVacuumTriggerRewriteBytes = int64(64 << 20) vlogGenerationVacuumMinInterval = 5 * time.Minute @@ -15018,8 +15128,13 @@ planned: len(observedSourceGCIDs), ) if forceObservedSourceGC { - db.vlogGenerationObservedGCRetryQueued.Add(1) - db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) + queuedIDs, droppedIDs := db.retryVlogGenerationObservedSourceGCList(observedSourceGCIDs) + db.debugVlogMaintf( + "gc_observed_retry reason=disabled_env observed_ids=%d queued_ids=%d dropped_ids=%d", + len(observedSourceGCIDs), + queuedIDs, + droppedIDs, + ) } return } @@ -15045,8 +15160,13 @@ planned: len(observedSourceGCIDs), ) if forceObservedSourceGC { - db.vlogGenerationObservedGCRetryQueued.Add(1) - db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) + queuedIDs, droppedIDs := db.retryVlogGenerationObservedSourceGCList(observedSourceGCIDs) + db.debugVlogMaintf( + "gc_observed_retry reason=backend_no_gcer observed_ids=%d queued_ids=%d dropped_ids=%d", + len(observedSourceGCIDs), + queuedIDs, + droppedIDs, + ) } return } @@ -15154,9 +15274,19 @@ planned: ) db.queueRetainedPruneObservedSourceIDs(observedSourceGCIDs) db.scheduleRetainedValueLogPruneForce() - db.vlogGenerationObservedGCRetryQueued.Add(1) - db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) - 
db.vlogGenerationCheckpointKickPending.Store(true) + queuedIDs, droppedIDs := db.retryVlogGenerationObservedSourceGCList(observedSourceGCIDs) + if queuedIDs > 0 { + db.vlogGenerationCheckpointKickPending.Store(true) + } + db.debugVlogMaintf( + "gc_observed_retry_result reason=retained_protected observed_ids=%d queued_ids=%d dropped_ids=%d max_attempts=%d", + len(observedSourceGCIDs), + queuedIDs, + droppedIDs, + vlogGenerationObservedGCRetryMaxAttempts, + ) + } else if forceObservedSourceGC { + db.finalizeVlogGenerationObservedSourceGCIDs(observedSourceGCIDs, false) } db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) db.vlogGenerationGCRuns.Add(1) @@ -15177,8 +15307,13 @@ planned: err, ) if forceObservedSourceGC { - db.vlogGenerationObservedGCRetryQueued.Add(1) - db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) + queuedIDs, droppedIDs := db.retryVlogGenerationObservedSourceGCList(observedSourceGCIDs) + db.debugVlogMaintf( + "gc_observed_retry reason=gc_error observed_ids=%d queued_ids=%d dropped_ids=%d", + len(observedSourceGCIDs), + queuedIDs, + droppedIDs, + ) } if errors.Is(err, context.Canceled) { db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) @@ -20701,6 +20836,13 @@ func (db *DB) Stats() map[string]string { db.vlogGenerationObservedGCMu.Lock() observedGCPending := len(db.vlogGenerationObservedGCSourceIDs) db.vlogGenerationObservedGCMu.Unlock() + observedGCLatencyCompleted := db.vlogGenerationObservedGCLatencyCompletedIDs.Load() + observedGCLatencyDropped := db.vlogGenerationObservedGCLatencyDroppedIDs.Load() + observedGCLatencyTotalMS := db.vlogGenerationObservedGCLatencyTotalMS.Load() + observedGCLatencyAvgMS := 0.0 + if totalObservedGCLatencyIDs := observedGCLatencyCompleted + observedGCLatencyDropped; totalObservedGCLatencyIDs > 0 { + observedGCLatencyAvgMS = float64(observedGCLatencyTotalMS) / float64(totalObservedGCLatencyIDs) + } rewriteAgeBlockedUntilNS := 
db.vlogGenerationRewriteAgeBlockedUntilNS.Load() rewriteAgeBlockedRemainingMS := int64(0) if rewriteAgeBlockedUntilNS > 0 { @@ -20908,6 +21050,13 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.observed_gc.taken_ids"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCTakenIDs.Load()) stats["treedb.cache.vlog_generation.observed_gc.runs"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRuns.Load()) stats["treedb.cache.vlog_generation.observed_gc.retry_queued"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRetryQueued.Load()) + stats["treedb.cache.vlog_generation.observed_gc.retry_dropped"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRetryDropped.Load()) + stats["treedb.cache.vlog_generation.observed_gc.retry_max_attempts"] = fmt.Sprintf("%d", vlogGenerationObservedGCRetryMaxAttempts) + stats["treedb.cache.vlog_generation.observed_gc.latency.completed_ids"] = fmt.Sprintf("%d", observedGCLatencyCompleted) + stats["treedb.cache.vlog_generation.observed_gc.latency.dropped_ids"] = fmt.Sprintf("%d", observedGCLatencyDropped) + stats["treedb.cache.vlog_generation.observed_gc.latency.total_ms"] = fmt.Sprintf("%d", observedGCLatencyTotalMS) + stats["treedb.cache.vlog_generation.observed_gc.latency.max_ms"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCLatencyMaxMS.Load()) + stats["treedb.cache.vlog_generation.observed_gc.latency.avg_ms"] = fmt.Sprintf("%.3f", observedGCLatencyAvgMS) stats["treedb.cache.vlog_generation.observed_gc.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsTotal.Load()) stats["treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Load()) stats["treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go 
b/TreeDB/caching/vlog_generation_scheduler_test.go index eb7636bc3..1ee40dd58 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -1298,6 +1298,139 @@ func TestVlogGenerationMaintenance_ObservedSourceGCBypassQuietIgnoresForegroundR } } +func TestVlogGenerationMaintenance_ObservedSourceGCCompletionClearsRetryState(t *testing.T) { + prepareDirectSchedulerTest(t) + t.Setenv(envDisableVlogGenerationRewrite, "1") + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + gcResponse: backenddb.ValueLogGCStats{ + ObservedSourceSegments: 1, + ObservedSourceSegmentsEligible: 1, + ObservedSourceSegmentsDeleted: 1, + ObservedSourceBytes: 256, + ObservedSourceBytesEligible: 256, + ObservedSourceBytesDeleted: 256, + }, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + defer cleanup() + skipRetainedPrune(db) + + db.queueVlogGenerationObservedSourceGCList([]uint32{41}) + db.vlogGenerationLastGCUnixNano.Store(time.Now().Add(-time.Minute).UnixNano()) + forceVlogMaintenanceIdle(db) + + db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: true, + rewriteDebtDrain: true, + }) + + if got := recorder.recordedGCObservedSourceCalls(); got != 1 { + t.Fatalf("observed-source gc calls=%d want 1", got) + } + if got := db.vlogGenerationObservedGCRetryQueued.Load(); got != 0 { + t.Fatalf("observed-source gc retry queued=%d want 0", got) + } + if got := db.vlogGenerationObservedGCRetryDropped.Load(); got != 0 { + t.Fatalf("observed-source gc retry dropped=%d want 0", got) + } + if got := db.vlogGenerationObservedGCLatencyCompletedIDs.Load(); got != 1 { + t.Fatalf("observed-source gc latency completed ids=%d want 1", got) + } + if got := 
db.vlogGenerationObservedGCLatencyDroppedIDs.Load(); got != 0 { + t.Fatalf("observed-source gc latency dropped ids=%d want 0", got) + } + if pending := len(db.takeVlogGenerationObservedSourceGCList()); pending != 0 { + t.Fatalf("observed-source gc pending ids=%d want 0", pending) + } + db.vlogGenerationObservedGCMu.Lock() + if _, exists := db.vlogGenerationObservedGCRetryAttempts[41]; exists { + db.vlogGenerationObservedGCMu.Unlock() + t.Fatalf("retry attempt state still present for observed id 41") + } + if _, exists := db.vlogGenerationObservedGCFirstQueuedUnixNano[41]; exists { + db.vlogGenerationObservedGCMu.Unlock() + t.Fatalf("first queued timestamp still present for observed id 41") + } + db.vlogGenerationObservedGCMu.Unlock() +} + +func TestVlogGenerationMaintenance_ObservedSourceGCRetryBudgetDropsAfterMaxAttempts(t *testing.T) { + prepareDirectSchedulerTest(t) + t.Setenv(envDisableVlogGenerationRewrite, "1") + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + gcResponse: backenddb.ValueLogGCStats{ + ObservedSourceSegments: 1, + ObservedSourceSegmentsEligible: 0, + ObservedSourceSegmentsProtectedRetained: 1, + ObservedSourceBytes: 128, + ObservedSourceBytesProtectedRetained: 128, + }, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + defer cleanup() + skipRetainedPrune(db) + + db.queueVlogGenerationObservedSourceGCList([]uint32{73}) + passes := int(vlogGenerationObservedGCRetryMaxAttempts) + 1 + for i := 0; i < passes; i++ { + db.vlogGenerationLastGCUnixNano.Store(time.Now().Add(-time.Minute).UnixNano()) + forceVlogMaintenanceIdle(db) + db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: true, + rewriteDebtDrain: true, + }) + } + + if got := recorder.recordedGCObservedSourceCalls(); got 
!= passes { + t.Fatalf("observed-source gc calls=%d want %d", got, passes) + } + if got := db.vlogGenerationObservedGCRetryQueued.Load(); got != uint64(vlogGenerationObservedGCRetryMaxAttempts) { + t.Fatalf("observed-source gc retry queued=%d want %d", got, vlogGenerationObservedGCRetryMaxAttempts) + } + if got := db.vlogGenerationObservedGCRetryDropped.Load(); got != 1 { + t.Fatalf("observed-source gc retry dropped=%d want 1", got) + } + if got := db.vlogGenerationObservedGCLatencyCompletedIDs.Load(); got != 0 { + t.Fatalf("observed-source gc latency completed ids=%d want 0", got) + } + if got := db.vlogGenerationObservedGCLatencyDroppedIDs.Load(); got != 1 { + t.Fatalf("observed-source gc latency dropped ids=%d want 1", got) + } + if pending := len(db.takeVlogGenerationObservedSourceGCList()); pending != 0 { + t.Fatalf("observed-source gc pending ids=%d want 0", pending) + } + db.vlogGenerationObservedGCMu.Lock() + if _, exists := db.vlogGenerationObservedGCRetryAttempts[73]; exists { + db.vlogGenerationObservedGCMu.Unlock() + t.Fatalf("retry attempt state still present for observed id 73 after drop") + } + if _, exists := db.vlogGenerationObservedGCFirstQueuedUnixNano[73]; exists { + db.vlogGenerationObservedGCMu.Unlock() + t.Fatalf("first queued timestamp still present for observed id 73 after drop") + } + db.vlogGenerationObservedGCMu.Unlock() +} + func TestVlogGenerationRewrite_FreshPlanExecIgnoresForegroundCancelUntilBoundedComplete(t *testing.T) { prepareDirectSchedulerTest(t) @@ -6268,6 +6401,11 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationObservedGCTakenIDs.Store(9) db.vlogGenerationObservedGCRuns.Store(3) db.vlogGenerationObservedGCRetryQueued.Store(2) + db.vlogGenerationObservedGCRetryDropped.Store(1) + db.vlogGenerationObservedGCLatencyCompletedIDs.Store(6) + db.vlogGenerationObservedGCLatencyDroppedIDs.Store(2) + db.vlogGenerationObservedGCLatencyTotalMS.Store(640) + 
db.vlogGenerationObservedGCLatencyMaxMS.Store(210) db.vlogGenerationObservedGCSourceSegmentsTotal.Store(11) db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Store(5) db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Store(3) @@ -6565,6 +6703,27 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.observed_gc.retry_queued"]; got != "2" { t.Fatalf("observed gc retry queued=%q want 2", got) } + if got := stats["treedb.cache.vlog_generation.observed_gc.retry_dropped"]; got != "1" { + t.Fatalf("observed gc retry dropped=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.retry_max_attempts"]; got != "3" { + t.Fatalf("observed gc retry max attempts=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.latency.completed_ids"]; got != "6" { + t.Fatalf("observed gc latency completed ids=%q want 6", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.latency.dropped_ids"]; got != "2" { + t.Fatalf("observed gc latency dropped ids=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.latency.total_ms"]; got != "640" { + t.Fatalf("observed gc latency total ms=%q want 640", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.latency.max_ms"]; got != "210" { + t.Fatalf("observed gc latency max ms=%q want 210", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.latency.avg_ms"]; got != "80.000" { + t.Fatalf("observed gc latency avg ms=%q want 80.000", got) + } if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_total"]; got != "11" { t.Fatalf("observed gc source segments total=%q want 11", got) } diff --git a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md index 113bebed3..6f5d83aa0 100644 --- a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md +++ b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md @@ -56,6 +56,47 @@ The 
report highlights: - zombie inventory (pinned vs unpinned bytes) - GC eligibility/protection signals +## Interleaved A/B Harness +For sync+rewrite tradeoff validation, use the interleaved harness: + +```bash +cat >/tmp/cel_control.env <<'EOF' +LOCAL_GOMAP_DIR=/path/to/control/gomap +TREEDB_OPEN_PROFILE=fast +EOF + +cat >/tmp/cel_candidate.env <<'EOF' +LOCAL_GOMAP_DIR=/path/to/candidate/gomap +TREEDB_OPEN_PROFILE=fast +EOF + +CONTROL_ENV_FILE=/tmp/cel_control.env \ +CANDIDATE_ENV_FILE=/tmp/cel_candidate.env \ +MAX_PAIRS=10 \ +MIN_PAIRS=4 \ +CLEAR_WIN_PAIRS=3 \ +CLEAR_LOSS_PAIRS=3 \ +./scripts/run_celestia_ab.sh +``` + +Default pair metric focus: +- `T_sync`: sync duration (seconds) +- `S_sync_app`: app dir bytes at sync end +- `S_sync_wal`: `application.db/maindb/wal` bytes at sync end +- `T_rw`: offline `vlog-rewrite` wall time +- `S_post_wal`: WAL bytes after offline rewrite +- `T_total = T_sync + T_rw` +- `max_rss_kb` (memory guardrail) + +Outputs: +- `artifacts/celestia_ab/<ts>/runs.csv` +- `artifacts/celestia_ab/<ts>/pairs.csv` +- `artifacts/celestia_ab/<ts>/summary.md` +- per-run JSON under `artifacts/celestia_ab/<ts>/runs/*/run.json` + +The harness alternates run order per pair (`control->candidate`, then +`candidate->control`) and can stop early on clear win/loss signals. + ## Experimental Knob - `TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1` - WAL-off only. 
diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py index a719f5cff..292a8a9e8 100755 --- a/scripts/analyze_vlog_maintenance_capacity.py +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -297,6 +297,12 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: "observed_gc_taken_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.taken_ids"), "observed_gc_runs": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.runs"), "observed_gc_retry_queued": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.retry_queued"), + "observed_gc_retry_dropped": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.retry_dropped"), + "observed_gc_retry_max_attempts": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.retry_max_attempts"), + "observed_gc_latency_completed_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.latency.completed_ids"), + "observed_gc_latency_dropped_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.latency.dropped_ids"), + "observed_gc_latency_total_ms": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.latency.total_ms"), + "observed_gc_latency_max_ms": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.latency.max_ms"), "observed_gc_source_segments_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_total"), "observed_gc_source_segments_eligible_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"), "observed_gc_source_segments_deleted_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"), @@ -375,6 +381,12 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: ) m["observed_gc_drain_pct"] = pct(m["observed_gc_taken_ids"], m["observed_gc_queued_ids"]) + m["observed_gc_latency_finalized_ids"] = m["observed_gc_latency_completed_ids"] + m["observed_gc_latency_dropped_ids"] 
+ m["observed_gc_latency_avg_ms"] = ( + (float(m["observed_gc_latency_total_ms"]) / float(m["observed_gc_latency_finalized_ids"])) + if m["observed_gc_latency_finalized_ids"] > 0 + else 0.0 + ) m["observed_gc_source_segments_eligible_pct"] = pct( m["observed_gc_source_segments_eligible_total"], m["observed_gc_source_segments_total"], @@ -603,6 +615,16 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst f"{summary['observed_gc_queued_ids']} / {summary['observed_gc_taken_ids']} / {summary['observed_gc_pending_ids']} " f"(drain={summary['observed_gc_drain_pct']:.1f}%, retries={summary['observed_gc_retry_queued']}, runs={summary['observed_gc_runs']})" ) + print( + " retry budget/latency: " + f"max_attempts={summary['observed_gc_retry_max_attempts']} " + f"retry_dropped={summary['observed_gc_retry_dropped']} " + f"finalized_ids={summary['observed_gc_latency_finalized_ids']} " + f"(completed={summary['observed_gc_latency_completed_ids']}, dropped={summary['observed_gc_latency_dropped_ids']}) " + f"latency total_ms={summary['observed_gc_latency_total_ms']} " + f"avg_ms={summary['observed_gc_latency_avg_ms']:.3f} " + f"max_ms={summary['observed_gc_latency_max_ms']}" + ) print( " observed-source totals: " f"segments total={summary['observed_gc_source_segments_total']} " @@ -659,6 +681,8 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst notes.append("rewrite copied stale bytes but immediate reclaim is zero; inspect GC eligibility/protection and post-run rewrite window") if summary["observed_gc_pending_ids"] > 0: notes.append("observed-source GC backlog still pending; may need longer run window or higher checkpoint-kick pressure") + if summary["observed_gc_retry_dropped"] > 0: + notes.append("observed-source GC retries hit max-attempt budget for some IDs; inspect retained-prune throughput and checkpoint-kick cadence") if summary["maintenance_collision_rate_pct"] > 20.0: notes.append("maintenance collision rate 
is high; lane contention may be throttling rewrite/GC progress") if summary["rewrite_segment_realization_pct"] < 60.0 and summary["rewrite_plan_selected_segments_total"] > 0: diff --git a/scripts/run_celestia_ab.sh b/scripts/run_celestia_ab.sh new file mode 100755 index 000000000..71ef9e3db --- /dev/null +++ b/scripts/run_celestia_ab.sh @@ -0,0 +1,510 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) +ANALYZER="${ANALYZER:-$ROOT/scripts/analyze_vlog_maintenance_capacity.py}" +RUN_HOME_GLOB="${RUN_HOME_GLOB:-$HOME/.celestia-app-mainnet-treedb-*}" +RUN_CMD="${RUN_CMD:-$HOME/run_celestia.sh}" +CONTROL_ENV_FILE="${CONTROL_ENV_FILE:-}" +CANDIDATE_ENV_FILE="${CANDIDATE_ENV_FILE:-}" +TREEMAP_BIN="${TREEMAP_BIN:-/home/mikers/dev/snissn/celestia-app-p4/build/treemap-local}" +REWRITE_ENABLED="${REWRITE_ENABLED:-1}" +MAX_PAIRS="${MAX_PAIRS:-10}" +MIN_PAIRS="${MIN_PAIRS:-4}" +CLEAR_WIN_PAIRS="${CLEAR_WIN_PAIRS:-3}" +CLEAR_LOSS_PAIRS="${CLEAR_LOSS_PAIRS:-3}" +SIZE_TOLERANCE_BYTES="${SIZE_TOLERANCE_BYTES:-67108864}" +TIME_TOLERANCE_SECONDS="${TIME_TOLERANCE_SECONDS:-120}" +STOP_ON_CLEAR="${STOP_ON_CLEAR:-1}" +SLEEP_BETWEEN_RUNS_SECONDS="${SLEEP_BETWEEN_RUNS_SECONDS:-5}" +TS="$(date +%Y%m%d%H%M%S)" +OUT="${OUT_DIR:-$ROOT/artifacts/celestia_ab/$TS}" + +if ! command -v python3 >/dev/null 2>&1; then + echo "python3 is required" >&2 + exit 1 +fi +if [[ ! -x "$ANALYZER" ]]; then + echo "analyzer not found/executable: $ANALYZER" >&2 + exit 1 +fi +if [[ "$MAX_PAIRS" -lt 1 ]]; then + echo "MAX_PAIRS must be >= 1" >&2 + exit 1 +fi + +mkdir -p "$OUT/runs" + +cat >"$OUT/meta.txt" </dev/null || true +} + +du_bytes() { + local target="$1" + if [[ ! 
-e "$target" ]]; then + echo 0 + return 0 + fi + if du -sb "$target" >/dev/null 2>&1; then + du -sb "$target" 2>/dev/null | awk '{print $1}' + return 0 + fi + du -sk "$target" 2>/dev/null | awk '{print $1 * 1024}' +} + +detect_new_run_home() { + local before_file="$1" + local -A seen=() + while IFS= read -r path; do + [[ -n "$path" ]] && seen["$path"]=1 + done <"$before_file" + + while IFS= read -r path; do + if [[ -z "$path" ]]; then + continue + fi + if [[ -z "${seen[$path]+x}" ]]; then + echo "$path" + return 0 + fi + done < <(list_run_homes) + + list_run_homes | head -n 1 +} + +run_variant() { + local pair_index="$1" + local variant="$2" + local env_file="$3" + + local run_id + run_id=$(printf "%02d_%s" "$pair_index" "$variant") + local run_dir="$OUT/runs/$run_id" + mkdir -p "$run_dir" + + local before_file="$run_dir/before_homes.txt" + list_run_homes >"$before_file" + + local run_start + run_start=$(date +%s) + ( + set -euo pipefail + if [[ -n "$env_file" ]]; then + # shellcheck source=/dev/null + source "$env_file" + fi + bash -lc "$RUN_CMD" + ) >"$run_dir/launcher.log" 2>&1 + local run_end + run_end=$(date +%s) + + local run_home + run_home="$(detect_new_run_home "$before_file")" + if [[ -z "$run_home" || ! -d "$run_home" ]]; then + echo "failed to detect run home for $run_id" >&2 + exit 1 + fi + + local app_db="$run_home/data/application.db" + local pre_app_bytes pre_wal_bytes + pre_app_bytes="$(du_bytes "$app_db")" + pre_wal_bytes="$(du_bytes "$app_db/maindb/wal")" + + local analyze_json="$run_dir/maintenance.json" + if ! "$ANALYZER" --json "$run_home" >"$analyze_json" 2>"$run_dir/analyze.stderr.log"; then + rm -f "$analyze_json" + fi + + local rewrite_attempted=0 + local rewrite_seconds=0 + local rewrite_rc=0 + if [[ "$REWRITE_ENABLED" == "1" && -x "$TREEMAP_BIN" && -d "$app_db" ]]; then + rewrite_attempted=1 + local rewrite_start + rewrite_start=$(date +%s) + set +e + "$TREEMAP_BIN" vlog-rewrite "$app_db" -rw >"$run_dir/rewrite.log" 2>&1 + rewrite_rc=$? 
+ set -e + local rewrite_end + rewrite_end=$(date +%s) + rewrite_seconds=$((rewrite_end - rewrite_start)) + else + rewrite_rc=0 + fi + + local post_app_bytes post_wal_bytes + post_app_bytes="$(du_bytes "$app_db")" + post_wal_bytes="$(du_bytes "$app_db/maindb/wal")" + + local run_json="$run_dir/run.json" + python3 - "$run_home" "$run_json" "$variant" "$pair_index" "$run_start" "$run_end" "$rewrite_attempted" "$rewrite_seconds" "$rewrite_rc" "$pre_app_bytes" "$pre_wal_bytes" "$post_app_bytes" "$post_wal_bytes" "$analyze_json" <<'PY' +import json +import sys +from pathlib import Path + +run_home = Path(sys.argv[1]) +out_path = Path(sys.argv[2]) +variant = sys.argv[3] +pair_index = int(sys.argv[4]) +run_start = int(sys.argv[5]) +run_end = int(sys.argv[6]) +rewrite_attempted = int(sys.argv[7]) +rewrite_seconds = int(sys.argv[8]) +rewrite_rc = int(sys.argv[9]) +pre_app_bytes = int(sys.argv[10]) +pre_wal_bytes = int(sys.argv[11]) +post_app_bytes = int(sys.argv[12]) +post_wal_bytes = int(sys.argv[13]) +analyze_json_path = Path(sys.argv[14]) + +def parse_sync_time(path: Path) -> dict[str, str]: + out: dict[str, str] = {} + if not path.is_file(): + return out + for line in path.read_text(encoding="utf-8", errors="replace").splitlines(): + line = line.strip() + if not line or line == "---" or "=" not in line: + continue + k, v = line.split("=", 1) + out[k.strip()] = v.strip() + return out + +def safe_int(raw: str | None, default: int = 0) -> int: + if raw is None: + return default + s = str(raw).strip() + if not s: + return default + try: + return int(s) + except Exception: + try: + return int(float(s)) + except Exception: + return default + +sync = parse_sync_time(run_home / "sync" / "sync-time.log") +maintenance = {} +if analyze_json_path.is_file(): + try: + payload = json.loads(analyze_json_path.read_text(encoding="utf-8")) + if isinstance(payload, dict): + summary = payload.get("summary") + if isinstance(summary, dict): + maintenance = summary + except Exception: + 
maintenance = {} + +t_sync = safe_int(sync.get("duration_seconds"), max(0, run_end - run_start)) +t_rw = rewrite_seconds if rewrite_attempted == 1 else 0 +if rewrite_attempted == 1 and rewrite_rc != 0: + t_total = None +else: + t_total = t_sync + t_rw + +result = { + "pair_index": pair_index, + "variant": variant, + "run_home": str(run_home), + "sync": { + "duration_seconds": t_sync, + "max_rss_kb": safe_int(sync.get("max_rss_kb"), 0), + "max_hwm_kb": safe_int(sync.get("max_hwm_kb"), 0), + "end_app_bytes": safe_int(sync.get("end_app_bytes"), pre_app_bytes), + "end_data_bytes": safe_int(sync.get("end_data_bytes"), 0), + "end_home_bytes": safe_int(sync.get("end_home_bytes"), 0), + }, + "rewrite": { + "attempted": rewrite_attempted == 1, + "seconds": t_rw, + "exit_code": rewrite_rc, + }, + "sizes": { + "sync_app_bytes": pre_app_bytes, + "sync_wal_bytes": pre_wal_bytes, + "post_app_bytes": post_app_bytes, + "post_wal_bytes": post_wal_bytes, + }, + "metrics": { + "t_sync_seconds": t_sync, + "t_rewrite_seconds": t_rw, + "t_total_seconds": t_total, + "s_sync_app_bytes": pre_app_bytes, + "s_sync_wal_bytes": pre_wal_bytes, + "s_post_app_bytes": post_app_bytes, + "s_post_wal_bytes": post_wal_bytes, + "max_rss_kb": safe_int(sync.get("max_rss_kb"), 0), + }, + "maintenance_summary": maintenance, +} +out_path.write_text(json.dumps(result, indent=2, sort_keys=True), encoding="utf-8") +print(out_path) +PY + + echo "run_id=$run_id run_home=$run_home json=$run_json" +} + +aggregate_and_decide() { + local decision_json="$OUT/decision.json" + python3 - "$OUT" "$SIZE_TOLERANCE_BYTES" "$TIME_TOLERANCE_SECONDS" "$MIN_PAIRS" "$CLEAR_WIN_PAIRS" "$CLEAR_LOSS_PAIRS" "$MAX_PAIRS" "$STOP_ON_CLEAR" "$decision_json" <<'PY' +import csv +import json +import sys +from pathlib import Path + +out = Path(sys.argv[1]) +size_tol = int(sys.argv[2]) +time_tol = int(sys.argv[3]) +min_pairs = int(sys.argv[4]) +clear_win_pairs = int(sys.argv[5]) +clear_loss_pairs = int(sys.argv[6]) +max_pairs = 
int(sys.argv[7]) +stop_on_clear = sys.argv[8] == "1" +decision_path = Path(sys.argv[9]) + +run_files = sorted(out.glob("runs/*/run.json")) +runs = [] +for p in run_files: + try: + runs.append(json.loads(p.read_text(encoding="utf-8"))) + except Exception: + continue + +runs.sort(key=lambda r: (int(r.get("pair_index", 0)), str(r.get("variant", "")))) + +runs_csv = out / "runs.csv" +with runs_csv.open("w", newline="", encoding="utf-8") as fh: + w = csv.writer(fh) + w.writerow([ + "pair_index", + "variant", + "run_home", + "t_sync_seconds", + "t_rewrite_seconds", + "t_total_seconds", + "s_sync_app_bytes", + "s_sync_wal_bytes", + "s_post_app_bytes", + "s_post_wal_bytes", + "max_rss_kb", + "rewrite_exit_code", + "rewrite_runs", + "gc_runs", + "observed_gc_retry_queued", + "observed_gc_retry_dropped", + ]) + for r in runs: + m = r.get("metrics", {}) or {} + s = r.get("sizes", {}) or {} + rw = r.get("rewrite", {}) or {} + summary = r.get("maintenance_summary", {}) or {} + w.writerow([ + int(r.get("pair_index", 0)), + str(r.get("variant", "")), + str(r.get("run_home", "")), + m.get("t_sync_seconds"), + m.get("t_rewrite_seconds"), + m.get("t_total_seconds"), + s.get("sync_app_bytes"), + s.get("sync_wal_bytes"), + s.get("post_app_bytes"), + s.get("post_wal_bytes"), + m.get("max_rss_kb"), + rw.get("exit_code"), + summary.get("rewrite_runs", 0), + summary.get("gc_runs", 0), + summary.get("observed_gc_retry_queued", 0), + summary.get("observed_gc_retry_dropped", 0), + ]) + +by_pair: dict[int, dict[str, dict]] = {} +for r in runs: + pair = int(r.get("pair_index", 0)) + by_pair.setdefault(pair, {})[str(r.get("variant", ""))] = r + +pair_rows = [] +wins = 0 +losses = 0 +for pair in sorted(by_pair): + row = by_pair[pair] + ctrl = row.get("control") + cand = row.get("candidate") + if not ctrl or not cand: + continue + cm = cand.get("metrics", {}) or {} + bm = ctrl.get("metrics", {}) or {} + cand_total = cm.get("t_total_seconds") + base_total = bm.get("t_total_seconds") + 
cand_post_wal = cm.get("s_post_wal_bytes") + base_post_wal = bm.get("s_post_wal_bytes") + cand_sync = cm.get("t_sync_seconds") + base_sync = bm.get("t_sync_seconds") + cand_sync_app = cm.get("s_sync_app_bytes") + base_sync_app = bm.get("s_sync_app_bytes") + + def delta(a, b): + if a is None or b is None: + return None + return a - b + + d_total = delta(cand_total, base_total) + d_sync = delta(cand_sync, base_sync) + d_post_wal = delta(cand_post_wal, base_post_wal) + d_sync_app = delta(cand_sync_app, base_sync_app) + + outcome = "neutral" + if d_post_wal is not None and d_total is not None: + win = (d_post_wal <= -size_tol) and (d_total <= time_tol) + loss = (d_post_wal >= size_tol) and (d_total >= -time_tol) + if win and not loss: + outcome = "win" + wins += 1 + elif loss and not win: + outcome = "loss" + losses += 1 + + pair_rows.append({ + "pair_index": pair, + "delta_t_sync_seconds": d_sync, + "delta_t_total_seconds": d_total, + "delta_s_sync_app_bytes": d_sync_app, + "delta_s_post_wal_bytes": d_post_wal, + "outcome": outcome, + }) + +pairs_csv = out / "pairs.csv" +with pairs_csv.open("w", newline="", encoding="utf-8") as fh: + w = csv.writer(fh) + w.writerow([ + "pair_index", + "delta_t_sync_seconds", + "delta_t_total_seconds", + "delta_s_sync_app_bytes", + "delta_s_post_wal_bytes", + "outcome", + ]) + for r in pair_rows: + w.writerow([ + r["pair_index"], + r["delta_t_sync_seconds"], + r["delta_t_total_seconds"], + r["delta_s_sync_app_bytes"], + r["delta_s_post_wal_bytes"], + r["outcome"], + ]) + +completed_pairs = len(pair_rows) +reason = "continue" +stop = False +if completed_pairs >= max_pairs: + stop = True + reason = "max_pairs" +elif stop_on_clear and completed_pairs >= min_pairs: + if wins >= clear_win_pairs and wins > losses: + stop = True + reason = "clear_improvement" + elif losses >= clear_loss_pairs and losses > wins: + stop = True + reason = "clear_regression" + +summary_md = out / "summary.md" +lines = [] +lines.append("# run_celestia A/B 
summary") +lines.append("") +lines.append(f"- completed pairs: `{completed_pairs}`") +lines.append(f"- wins/losses/neutral: `{wins}` / `{losses}` / `{max(0, completed_pairs - wins - losses)}`") +lines.append(f"- size tolerance bytes: `{size_tol}`") +lines.append(f"- time tolerance seconds: `{time_tol}`") +lines.append(f"- decision: `{reason}`") +lines.append("") +lines.append("## Artifacts") +lines.append("") +lines.append(f"- runs csv: `{runs_csv}`") +lines.append(f"- pairs csv: `{pairs_csv}`") +lines.append(f"- per-run json: `{out / 'runs'}`") +if pair_rows: + last = pair_rows[-1] + lines.append("") + lines.append("## Last Pair") + lines.append("") + lines.append(f"- pair: `{last['pair_index']}` outcome=`{last['outcome']}`") + lines.append(f"- delta_t_sync_seconds: `{last['delta_t_sync_seconds']}`") + lines.append(f"- delta_t_total_seconds: `{last['delta_t_total_seconds']}`") + lines.append(f"- delta_s_sync_app_bytes: `{last['delta_s_sync_app_bytes']}`") + lines.append(f"- delta_s_post_wal_bytes: `{last['delta_s_post_wal_bytes']}`") +summary_md.write_text("\n".join(lines) + "\n", encoding="utf-8") + +payload = { + "completed_pairs": completed_pairs, + "wins": wins, + "losses": losses, + "neutral": max(0, completed_pairs - wins - losses), + "stop": stop, + "reason": reason, +} +decision_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") +print(json.dumps(payload, sort_keys=True)) +PY +} + +run_pair() { + local pair_index="$1" + if (( pair_index % 2 == 1 )); then + run_variant "$pair_index" "control" "$CONTROL_ENV_FILE" + sleep "$SLEEP_BETWEEN_RUNS_SECONDS" + run_variant "$pair_index" "candidate" "$CANDIDATE_ENV_FILE" + else + run_variant "$pair_index" "candidate" "$CANDIDATE_ENV_FILE" + sleep "$SLEEP_BETWEEN_RUNS_SECONDS" + run_variant "$pair_index" "control" "$CONTROL_ENV_FILE" + fi +} + +echo "output=$OUT" +echo "run_cmd=$RUN_CMD" + +decision_reason="continue" +for ((pair = 1; pair <= MAX_PAIRS; pair++)); do + echo "pair=$pair 
start" + run_pair "$pair" + aggregate_and_decide + decision_reason="$(python3 - "$OUT/decision.json" <<'PY' +import json +import sys +payload = json.loads(open(sys.argv[1], 'r', encoding='utf-8').read()) +print(payload.get('reason', 'continue')) +print('1' if payload.get('stop') else '0') +PY +)" + stop_flag="$(echo "$decision_reason" | tail -n 1)" + decision_reason="$(echo "$decision_reason" | head -n 1)" + echo "pair=$pair decision=$decision_reason" + if [[ "$stop_flag" == "1" ]]; then + break + fi + sleep "$SLEEP_BETWEEN_RUNS_SECONDS" +done + +echo "completed decision=$decision_reason" +echo "summary=$OUT/summary.md" +echo "runs_csv=$OUT/runs.csv" +echo "pairs_csv=$OUT/pairs.csv" From 491559139ed94ed5ed9fae0a78643303fc2e1878 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 12:23:22 -1000 Subject: [PATCH 39/61] bench: export env file vars in run_celestia a/b harness --- scripts/run_celestia_ab.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/run_celestia_ab.sh b/scripts/run_celestia_ab.sh index 71ef9e3db..d238e63af 100755 --- a/scripts/run_celestia_ab.sh +++ b/scripts/run_celestia_ab.sh @@ -109,7 +109,9 @@ run_variant() { set -euo pipefail if [[ -n "$env_file" ]]; then # shellcheck source=/dev/null + set -a source "$env_file" + set +a fi bash -lc "$RUN_CMD" ) >"$run_dir/launcher.log" 2>&1 From 27e7fe34311c8be9a2246f2407ac01027499955a Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 12:26:58 -1000 Subject: [PATCH 40/61] bench: avoid login-shell startup in celestia a/b harness --- scripts/run_celestia_ab.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/run_celestia_ab.sh b/scripts/run_celestia_ab.sh index d238e63af..73b691921 100755 --- a/scripts/run_celestia_ab.sh +++ b/scripts/run_celestia_ab.sh @@ -113,7 +113,9 @@ run_variant() { source "$env_file" set +a fi - bash -lc "$RUN_CMD" + # Non-login shell avoids user profile side effects (e.g. 
tty-dependent exports) + # that can fail under nohup/background runs. + bash -c "$RUN_CMD" ) >"$run_dir/launcher.log" 2>&1 local run_end run_end=$(date +%s) From 1c216249fd20b9f578e7682b40fa33077f454e3c Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 14:51:36 -1000 Subject: [PATCH 41/61] caching: reduce retry gc churn in live maintenance --- TreeDB/caching/db.go | 65 ++++++++++++---- .../caching/vlog_generation_scheduler_test.go | 76 +++++++++++++------ 2 files changed, 105 insertions(+), 36 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 10ff71d72..139119a46 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5893,6 +5893,7 @@ type DB struct { vlogGenerationLastRewritePlanUnixNano atomic.Int64 vlogGenerationLastRewriteUnixNano atomic.Int64 vlogGenerationLastGCUnixNano atomic.Int64 + vlogGenerationLastGCNoopUnixNano atomic.Int64 vlogGenerationLastCheckpointKickUnixNano atomic.Int64 vlogGenerationLastGCDryRunUnixNano atomic.Int64 vlogGenerationLastGCDryRunBytesEligible atomic.Int64 @@ -6096,6 +6097,7 @@ const ( vlogGenerationGCMinBytes = int64(1 << 20) vlogGenerationRewriteMinInterval = 30 * time.Second vlogGenerationGCMinInterval = 45 * time.Second + vlogGenerationGCNoopMinInterval = 3 * time.Minute vlogGenerationCheckpointKickMinInterval = 5 * time.Second vlogGenerationCheckpointKickRetryWindow = 5 * time.Second vlogGenerationDeferredRetryWindow = 30 * time.Second @@ -12959,17 +12961,16 @@ func (db *DB) maybeRunPeriodicVlogGenerationMaintenance(runGC bool) bool { return false } // Coarse preflight: while foreground activity is hot, avoid entering the - // maintenance engine unless a deferred/checkpoint wake is pending. This - // prevents high-frequency periodic no-op acquisitions. 
- if !runGC { - now := time.Now() - quiet := db.foregroundActivityQuietFor(now, vlogGenerationMaintenanceQuietWindow, vlogForegroundReadQuietWindow) - if !quiet && - !db.vlogGenerationCheckpointKickPending.Load() && - !db.vlogGenerationDeferredMaintenancePending.Load() && - !db.vlogGenerationDeferredMaintenanceDue(now) { - return false - } + // maintenance engine unless a deferred/checkpoint wake is pending. Apply this + // to both rewrite and periodic GC ticks; otherwise runGC ticks can still + // issue expensive full scans every interval during restore-heavy sync phases. + now := time.Now() + quiet := db.foregroundActivityQuietFor(now, vlogGenerationMaintenanceQuietWindow, vlogForegroundReadQuietWindow) + if !quiet && + !db.vlogGenerationCheckpointKickPending.Load() && + !db.vlogGenerationDeferredMaintenancePending.Load() && + !db.vlogGenerationDeferredMaintenanceDue(now) { + return false } db.maybeRunVlogGenerationMaintenance(runGC) return true @@ -13900,7 +13901,7 @@ func (db *DB) scheduleDueVlogGenerationDeferredMaintenance() { } func (db *DB) runVlogGenerationCheckpointKickRetries(opts vlogGenerationMaintenanceOptions) { - db.runVlogGenerationMaintenanceRetries(opts, vlogGenerationCheckpointKickRetryWindow, false) + db.runVlogGenerationMaintenanceRetries(opts, vlogGenerationCheckpointKickRetryWindow, true) } func (db *DB) runVlogGenerationMaintenanceRetries(opts vlogGenerationMaintenanceOptions, retryWindow time.Duration, stopWhenAcquired bool) { @@ -13951,7 +13952,11 @@ func (db *DB) runVlogGenerationMaintenanceRetries(opts vlogGenerationMaintenance db.vlogGenerationMaintenanceActive.Load(), ) } - ran := db.maybeRunVlogGenerationMaintenanceWithOptions(true, opts) + // Retry-driven maintenance (checkpoint kick / deferred stage confirmation) + // prioritizes rewrite debt progress. Keep periodic/full-scan GC on the + // normal scheduler path to avoid introducing long full-scan stalls on hot + // checkpoint-triggered retries. 
+ ran := db.maybeRunVlogGenerationMaintenanceWithOptions(false, opts) if stopWhenAcquired && ran { if opts.debugSource != "" { db.debugVlogMaintf( @@ -15120,6 +15125,12 @@ planned: observedSourceGCIDs := db.takeVlogGenerationObservedSourceGCList() forceObservedSourceGC := len(observedSourceGCIDs) > 0 + if !runGC && opts.bypassQuiet && !forceObservedSourceGC { + // Checkpoint-kick/deferred retry passes are rewrite-priority. Do not run + // opportunistic GC here unless we are replaying observed-source IDs from + // a prior rewrite/GC cycle. + return + } if envBool(envDisableVlogGenerationGC) { db.debugVlogMaintf( "gc_skip reason=disabled_env run_gc=%t force_observed=%t observed_ids=%d", @@ -15170,7 +15181,9 @@ planned: } return } - needEligibilityEstimate := !runGC && !forceObservedSourceGC && !db.shouldRunVlogGenerationGC(retained, reclaimable, churnBps) + // Retry-driven checkpoint/deferred passes are rewrite-priority paths. Avoid + // issuing GC dry-run scans there; let periodic/manual GC decide eligibility. 
+ needEligibilityEstimate := !runGC && !opts.bypassQuiet && !forceObservedSourceGC && !db.shouldRunVlogGenerationGC(retained, reclaimable, churnBps) now = time.Now() lastGC := db.vlogGenerationLastGCUnixNano.Load() if lastGC > 0 { @@ -15187,6 +15200,21 @@ planned: return } } + if !forceObservedSourceGC { + lastNoop := db.vlogGenerationLastGCNoopUnixNano.Load() + if lastNoop > 0 { + lastNoopAt := time.Unix(0, lastNoop) + if now.Sub(lastNoopAt) < vlogGenerationGCNoopMinInterval { + db.debugVlogMaintf( + "gc_skip reason=noop_cooldown run_gc=%t since_ms=%.3f min_ms=%.3f", + runGC, + float64(now.Sub(lastNoopAt).Microseconds())/1000, + float64(vlogGenerationGCNoopMinInterval.Microseconds())/1000, + ) + return + } + } + } db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerRunning) db.vlogGenerationLastReason.Store(vlogGenerationReasonPeriodicGC) err = db.runWithBackendMaintenanceOptions(backendMaintenanceOptions{ @@ -15253,6 +15281,15 @@ planned: gcStats.ObservedSourceSegmentsProtectedRetained, ) db.observeVlogGenerationGCStats(gcStats) + if !forceObservedSourceGC && + gcStats.BytesDeleted == 0 && + gcStats.SegmentsDeleted == 0 && + gcStats.BytesEligible == 0 && + gcStats.SegmentsEligible == 0 { + db.vlogGenerationLastGCNoopUnixNano.Store(now.UnixNano()) + } else { + db.vlogGenerationLastGCNoopUnixNano.Store(0) + } if gcStats.BytesProtectedRetained > 0 && gcStats.BytesEligible == 0 && db.valueLogRetainedClosedBytes.Load() > 0 { // When GC classifies all reclaim blockers as retained-path protection, // trigger an eager retained prune pass to release stale lifecycle pins. 
diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 1ee40dd58..f25b8b4dd 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -4309,8 +4309,8 @@ func TestCheckpoint_KicksVlogGenerationRewriteDespiteRecentForegroundActivity(t if _, calls := recorder.recordedPlan(); calls != 1 { t.Fatalf("plan calls=%d want=1", calls) } - if got := db.checkpointRuns.Load(); got < 2 { - t.Fatalf("checkpoint runs=%d want >=2", got) + if got := db.checkpointRuns.Load(); got < 1 { + t.Fatalf("checkpoint runs=%d want >=1", got) } stats := db.Stats() if got := stats["treedb.cache.vlog_generation.checkpoint_kick.runs"]; got != "1" { @@ -4489,7 +4489,7 @@ func TestCheckpoint_KickSelfDrainsMaintenanceCollision(t *testing.T) { <-release } -func TestCheckpoint_KicksVlogGenerationGCDespiteRecentForegroundActivity(t *testing.T) { +func TestCheckpoint_KickDoesNotForceGCDuringRecentForegroundActivity(t *testing.T) { disableVlogGenerationLoop(t) t.Setenv(envDisableVlogGenerationRewrite, "1") @@ -4536,31 +4536,19 @@ func TestCheckpoint_KicksVlogGenerationGCDespiteRecentForegroundActivity(t *test t.Fatalf("checkpoint: %v", err) } - deadline := time.Now().Add(2 * schedulerTestWait(t)) - for { - _, realCalls, _ := recorder.recordedCalls() - if realCalls == 1 { - break - } - if time.Now().After(deadline) { - dryCalls, realCalls, _ := recorder.recordedCalls() - t.Fatalf("checkpoint kick did not run gc in time: dryCalls=%d realCalls=%d", dryCalls, realCalls) - } - time.Sleep(10 * time.Millisecond) + time.Sleep(150 * time.Millisecond) + if dryCalls, realCalls, _ := recorder.recordedCalls(); dryCalls != 0 || realCalls != 0 { + t.Fatalf("gc calls dry=%d real=%d want dry=0 real=0", dryCalls, realCalls) } - - if dryCalls, realCalls, _ := recorder.recordedCalls(); dryCalls != 0 || realCalls != 1 { - t.Fatalf("gc calls dry=%d real=%d want dry=0 real=1", dryCalls, realCalls) - } - 
if got := db.checkpointRuns.Load(); got < 2 { - t.Fatalf("checkpoint runs=%d want >=2", got) + if got := db.checkpointRuns.Load(); got != 1 { + t.Fatalf("checkpoint runs=%d want 1", got) } stats := db.Stats() if got := stats["treedb.cache.vlog_generation.checkpoint_kick.runs"]; got != "1" { t.Fatalf("checkpoint kick runs=%q want 1", got) } - if got := stats["treedb.cache.vlog_generation.checkpoint_kick.gc_runs"]; got != "1" { - t.Fatalf("checkpoint kick gc runs=%q want 1", got) + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.gc_runs"]; got != "0" { + t.Fatalf("checkpoint kick gc runs=%q want 0", got) } if got := stats["treedb.cache.vlog_generation.checkpoint_kick.active"]; got != "false" { t.Fatalf("checkpoint kick active=%q want false", got) @@ -4597,6 +4585,50 @@ func TestVlogGenerationMaintenance_PeriodicGCSkipsWhileRewriteAgeBlocked(t *test } } +func TestVlogGenerationMaintenance_PeriodicGCNoopCooldown(t *testing.T) { + prepareDirectSchedulerTest(t) + t.Setenv(envDisableVlogGenerationRewrite, "1") + + dir := t.TempDir() + + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + gcResponse: backenddb.ValueLogGCStats{}, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + t.Cleanup(cleanup) + skipRetainedPrune(db) + forceVlogMaintenanceIdle(db) + + quietSince := time.Now().Add(-2 * vlogGenerationMaintenanceQuietWindow).UnixNano() + db.lastForegroundWriteUnixNano.Store(quietSince) + db.lastForegroundReadUnixNano.Store(quietSince) + db.activeForegroundIterators.Store(0) + + db.maybeRunVlogGenerationMaintenance(true) + + if _, calls := recorder.recordedGC(); calls != 1 { + t.Fatalf("first periodic GC calls=%d want=1", calls) + } + if got := db.vlogGenerationLastGCNoopUnixNano.Load(); got <= 0 { + t.Fatalf("last GC noop unix nano=%d want >0 after zero-eligibility pass", got) + } + + // Bypass the normal 
min-interval gate; noop cooldown should still suppress. + db.vlogGenerationLastGCUnixNano.Store(time.Now().Add(-2 * vlogGenerationGCMinInterval).UnixNano()) + forceVlogMaintenanceIdle(db) + db.maybeRunVlogGenerationMaintenance(true) + + if _, calls := recorder.recordedGC(); calls != 1 { + t.Fatalf("periodic GC should skip under noop cooldown; calls=%d want=1", calls) + } +} + func TestVlogGenerationMaintenance_PeriodicGCSkipsInWALOnMode(t *testing.T) { prepareDirectSchedulerTest(t) From 70eb677594bb72f5cabf2f8ccea8f83e075db4c7 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 15:54:42 -1000 Subject: [PATCH 42/61] bench: add celestia fast-gate loop and low-signal stop rules --- docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md | 118 +++ scripts/celestia_fast_gate.sh | 780 +++++++++++++++++++ scripts/run_celestia_ab.sh | 46 +- 3 files changed, 936 insertions(+), 8 deletions(-) create mode 100644 docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md create mode 100755 scripts/celestia_fast_gate.sh diff --git a/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md new file mode 100644 index 000000000..25ba010c0 --- /dev/null +++ b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md @@ -0,0 +1,118 @@ +# Celestia Compression Iteration Loop + +This loop exists to avoid slow, low-signal experimentation. + +Primary objective: +- Reduce on-disk `application.db` bytes. + +Secondary objectives: +- Keep combined wall time (`sync + rewrite`) bounded. +- Avoid memory regressions (`max_rss`). +- Keep gzip as a sanity check, not the primary objective. + +## Stage 0: Hypothesis Contract (Required) + +Before running anything expensive, define: +- hypothesis: what changed and why it should help +- expected effect size: minimum size delta worth promoting +- time budget: max acceptable wall-time regression +- rollback condition: what result means we stop and redesign + +If expected effect size is below threshold, do not run full `run_celestia` yet. 
+ +## Stage 1: Fast Gate (Default Iteration Loop) + +Use `scripts/celestia_fast_gate.sh` for fast interleaved control/candidate A/B. + +What it measures per run: +- pre-rewrite size: `sync_app`, `sync_wal`, optional `sync_gzip` +- post-rewrite size: `post_app`, `post_wal`, optional `post_gzip` +- timing: benchmark duration + rewrite duration + total +- throughput: batch-write ops/sec from unified-bench output + +Defaults chosen for celestia-like pressure: +- `-profile fast` +- `-val-pattern celestia_height_prefix_fill` +- dict compression enabled +- dict defaults passed explicitly: + - `-treedb-vlog-dict-train-bytes=1048576` + - `-treedb-vlog-dict-dict-bytes=32768` + +Fast-gate anti-loop safeguards: +- interleaved order alternates each pair (bias reduction) +- early clear stop (improvement/regression) +- futility stop when remaining pairs cannot reach a clear decision +- low-signal stop on neutral-streak threshold +- per-run process review artifact (`process_review.md`) + +Example: + +```bash +MAX_PAIRS=6 \ +MIN_PAIRS=3 \ +CLEAR_WIN_PAIRS=2 \ +CLEAR_LOSS_PAIRS=2 \ +LOW_SIGNAL_MIN_PAIRS=3 \ +LOW_SIGNAL_NEUTRAL_STREAK=3 \ +SIZE_FIELD=s_post_app_bytes \ +SIZE_TOLERANCE_BYTES=$((64<<20)) \ +TIME_TOLERANCE_SECONDS=30 \ +./scripts/celestia_fast_gate.sh +``` + +Outputs: +- `summary.md` +- `process_review.md` +- `runs.csv` +- `pairs.csv` +- per-run `run.json` + +## Stage 2: Pprof/Implementation Efficiency Pass + +Run this stage before full `run_celestia` if fast gate shows: +- promising size gains with time regression, or +- ambiguous neutral outcomes near threshold. + +Goal: +- remove avoidable implementation overhead (copying/alloc/lock contention) +- preserve size gains while pulling time back inside budget + +## Stage 3: Full `run_celestia` A/B Confirmation + +Only promote candidates that pass Stage 1 and Stage 2. + +Use `scripts/run_celestia_ab.sh` with interleaved pairs and stop rules. 
+ +Now includes anti-loop safeguards: +- clear stop (improvement/regression) +- futility stop (`futile_remaining_pairs`) +- low-signal neutral-streak stop (`low_signal_neutral_streak`) + +Example: + +```bash +MAX_PAIRS=4 \ +MIN_PAIRS=3 \ +CLEAR_WIN_PAIRS=2 \ +CLEAR_LOSS_PAIRS=2 \ +LOW_SIGNAL_MIN_PAIRS=3 \ +LOW_SIGNAL_NEUTRAL_STREAK=3 \ +REWRITE_ENABLED=1 \ +./scripts/run_celestia_ab.sh +``` + +## Process Review Cadence + +Review and revise the loop after every decision event: +- `clear_improvement` +- `clear_regression` +- `futile_remaining_pairs` +- `low_signal_neutral_streak` + +Required review questions: +- Was the fast gate predictive of full-run direction? +- Were thresholds too strict or too loose for current goals? +- Did we spend time validating changes below meaningful effect size? +- Is the next candidate large enough to justify promotion? + +If two consecutive campaigns end in low-signal/futility, tighten promotion gates and bundle larger candidate deltas before next full run. diff --git a/scripts/celestia_fast_gate.sh b/scripts/celestia_fast_gate.sh new file mode 100755 index 000000000..f79292e0f --- /dev/null +++ b/scripts/celestia_fast_gate.sh @@ -0,0 +1,780 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd) +cd "$ROOT" + +BASELINE_HASH="${BASELINE_HASH:-origin/main}" +CANDIDATE_UNIFIED_BIN="${CANDIDATE_UNIFIED_BIN:-}" +CANDIDATE_TREEMAP_BIN="${CANDIDATE_TREEMAP_BIN:-}" +BASELINE_UNIFIED_BIN="${BASELINE_UNIFIED_BIN:-}" +BASELINE_TREEMAP_BIN="${BASELINE_TREEMAP_BIN:-}" +SCRIPT_GOWORK="${SCRIPT_GOWORK:-off}" + +MAX_PAIRS="${MAX_PAIRS:-6}" +MIN_PAIRS="${MIN_PAIRS:-3}" +CLEAR_WIN_PAIRS="${CLEAR_WIN_PAIRS:-2}" +CLEAR_LOSS_PAIRS="${CLEAR_LOSS_PAIRS:-2}" +STOP_ON_CLEAR="${STOP_ON_CLEAR:-1}" +LOW_SIGNAL_MIN_PAIRS="${LOW_SIGNAL_MIN_PAIRS:-3}" +LOW_SIGNAL_NEUTRAL_STREAK="${LOW_SIGNAL_NEUTRAL_STREAK:-3}" +SLEEP_BETWEEN_RUNS_SECONDS="${SLEEP_BETWEEN_RUNS_SECONDS:-2}" + +SIZE_FIELD="${SIZE_FIELD:-s_post_app_bytes}" +SIZE_TOLERANCE_BYTES="${SIZE_TOLERANCE_BYTES:-67108864}" +TIME_TOLERANCE_SECONDS="${TIME_TOLERANCE_SECONDS:-30}" + +PROFILE="${PROFILE:-fast}" +DBS="${DBS:-treedb}" +TESTS="${TESTS:-batch_write}" +KEYS="${KEYS:-500000}" +VALSIZE="${VALSIZE:-128}" +BATCHSIZE="${BATCHSIZE:-8000}" +VAL_PATTERN="${VAL_PATTERN:-celestia_height_prefix_fill}" +SEED="${SEED:-1}" + +FORCE_VALUE_POINTERS="${FORCE_VALUE_POINTERS:-true}" +OUTER_LEAVES_IN_VLOG="${OUTER_LEAVES_IN_VLOG:-true}" +VLOG_COMPRESSION="${VLOG_COMPRESSION:-dict}" +VLOG_COMPRESSION_AUTOTUNE="${VLOG_COMPRESSION_AUTOTUNE:-aggressive}" +VLOG_COMPRESSION_VARIANT="${VLOG_COMPRESSION_VARIANT:-dict}" +DICT_TRAIN_BYTES="${DICT_TRAIN_BYTES:-1048576}" +DICT_BYTES="${DICT_BYTES:-32768}" + +REWRITE_ENABLED="${REWRITE_ENABLED:-1}" +REWRITE_ARGS="${REWRITE_ARGS:--rw}" +MEASURE_GZIP="${MEASURE_GZIP:-1}" +KEEP_DB_DIRS="${KEEP_DB_DIRS:-1}" + +COMMON_EXTRA_FLAGS="${COMMON_EXTRA_FLAGS:-}" +CONTROL_EXTRA_FLAGS="${CONTROL_EXTRA_FLAGS:-}" +CANDIDATE_EXTRA_FLAGS="${CANDIDATE_EXTRA_FLAGS:-}" + +TS="$(date +%Y%m%d%H%M%S)" +OUT="${OUT_DIR:-$ROOT/artifacts/celestia_fast_gate/$TS}" + +WORKTREE_PATH="" + +require_cmd() { + if ! 
command -v "$1" >/dev/null 2>&1; then + echo "missing required command: $1" >&2 + exit 2 + fi +} + +du_bytes() { + local target="$1" + if [[ ! -e "$target" ]]; then + echo 0 + return 0 + fi + if du -sb "$target" >/dev/null 2>&1; then + du -sb "$target" 2>/dev/null | awk '{print $1}' + return 0 + fi + du -sk "$target" 2>/dev/null | awk '{print $1 * 1024}' +} + +gzip_dir_bytes() { + local target="$1" + if [[ "$MEASURE_GZIP" != "1" ]]; then + echo 0 + return 0 + fi + if [[ ! -d "$target" ]]; then + echo 0 + return 0 + fi + tar -C "$target" -cf - . 2>/dev/null | gzip -1 -c | wc -c | tr -d '[:space:]' +} + +cleanup() { + if [[ -n "$WORKTREE_PATH" && -d "$WORKTREE_PATH" ]]; then + git worktree remove --force "$WORKTREE_PATH" >/dev/null 2>&1 || true + fi +} +trap cleanup EXIT + +parse_bench_log() { + local log_path="$1" + python3 - "$log_path" <<'PY' +import re +import sys +from pathlib import Path + +path = Path(sys.argv[1]) +text = path.read_text(encoding="utf-8", errors="replace") +lines = text.splitlines() + +throughput = None +for line in lines: + m = re.search(r"Batch Write\s*/\s*TreeDB[^=]*=\s*([0-9][0-9,]*(?:\.[0-9]+)?)", line) + if m: + throughput = float(m.group(1).replace(",", "")) + break +if throughput is None: + for line in lines: + m = re.match(r"\s*Batch Write\s+([0-9][0-9,]*(?:\.[0-9]+)?)\s*$", line) + if m: + throughput = float(m.group(1).replace(",", "")) + break + +keep_dir = "" +in_keep_block = False +for line in lines: + stripped = line.strip() + if stripped == "Kept Data Directories": + in_keep_block = True + continue + if in_keep_block: + if not stripped: + continue + if stripped.startswith("TreeDB (") and ":" in stripped: + maybe = stripped.split(":", 1)[1].strip() + if maybe.startswith("/"): + keep_dir = maybe + break + +if not keep_dir: + m = re.search(r"TreeDB \([^\)]*\):\s+(/tmp/bench[^\s]+)", text) + if m: + keep_dir = m.group(1) + +if not keep_dir: + raise SystemExit("unable to locate kept data directory in unified-bench output") + +if 
throughput is None: + throughput = 0.0 + +print(f"{keep_dir}\t{throughput}") +PY +} + +setup_bins() { + mkdir -p "$OUT/bin" "$OUT/worktrees" "$OUT/runs" + + if [[ -z "$CANDIDATE_UNIFIED_BIN" ]]; then + CANDIDATE_UNIFIED_BIN="$OUT/bin/unified-bench-candidate" + GOWORK="$SCRIPT_GOWORK" go build -o "$CANDIDATE_UNIFIED_BIN" ./cmd/unified_bench + fi + if [[ ! -x "$CANDIDATE_UNIFIED_BIN" ]]; then + echo "candidate unified-bench binary not executable: $CANDIDATE_UNIFIED_BIN" >&2 + exit 2 + fi + + if [[ -z "$CANDIDATE_TREEMAP_BIN" ]]; then + CANDIDATE_TREEMAP_BIN="$OUT/bin/treemap-candidate" + GOWORK="$SCRIPT_GOWORK" go build -o "$CANDIDATE_TREEMAP_BIN" ./TreeDB/cmd/treemap + fi + if [[ ! -x "$CANDIDATE_TREEMAP_BIN" ]]; then + echo "candidate treemap binary not executable: $CANDIDATE_TREEMAP_BIN" >&2 + exit 2 + fi + + if [[ -n "$BASELINE_UNIFIED_BIN" && -n "$BASELINE_TREEMAP_BIN" ]]; then + if [[ ! -x "$BASELINE_UNIFIED_BIN" ]]; then + echo "baseline unified-bench binary not executable: $BASELINE_UNIFIED_BIN" >&2 + exit 2 + fi + if [[ ! -x "$BASELINE_TREEMAP_BIN" ]]; then + echo "baseline treemap binary not executable: $BASELINE_TREEMAP_BIN" >&2 + exit 2 + fi + return 0 + fi + + if ! git cat-file -e "${BASELINE_HASH}^{commit}" >/dev/null 2>&1; then + git fetch --no-tags --depth=1 origin "$BASELINE_HASH" >/dev/null 2>&1 || git fetch --no-tags origin "$BASELINE_HASH" >/dev/null 2>&1 + fi + + WORKTREE_PATH="$OUT/worktrees/baseline" + git worktree add --detach "$WORKTREE_PATH" "$BASELINE_HASH" >/dev/null + + if [[ -z "$BASELINE_UNIFIED_BIN" ]]; then + BASELINE_UNIFIED_BIN="$OUT/bin/unified-bench-baseline" + ( + cd "$WORKTREE_PATH" + GOWORK="$SCRIPT_GOWORK" go build -o "$BASELINE_UNIFIED_BIN" ./cmd/unified_bench + ) + fi + if [[ -z "$BASELINE_TREEMAP_BIN" ]]; then + BASELINE_TREEMAP_BIN="$OUT/bin/treemap-baseline" + ( + cd "$WORKTREE_PATH" + GOWORK="$SCRIPT_GOWORK" go build -o "$BASELINE_TREEMAP_BIN" ./TreeDB/cmd/treemap + ) + fi + + if [[ ! 
-x "$BASELINE_UNIFIED_BIN" ]]; then + echo "baseline unified-bench binary not executable: $BASELINE_UNIFIED_BIN" >&2 + exit 2 + fi + if [[ ! -x "$BASELINE_TREEMAP_BIN" ]]; then + echo "baseline treemap binary not executable: $BASELINE_TREEMAP_BIN" >&2 + exit 2 + fi +} + +run_variant() { + local pair_index="$1" + local variant="$2" + + local bench_bin treemap_bin extra_flags + if [[ "$variant" == "candidate" ]]; then + bench_bin="$CANDIDATE_UNIFIED_BIN" + treemap_bin="$CANDIDATE_TREEMAP_BIN" + extra_flags="$CANDIDATE_EXTRA_FLAGS" + else + bench_bin="$BASELINE_UNIFIED_BIN" + treemap_bin="$BASELINE_TREEMAP_BIN" + extra_flags="$CONTROL_EXTRA_FLAGS" + fi + + local run_id + run_id=$(printf "%02d_%s" "$pair_index" "$variant") + local run_dir="$OUT/runs/$run_id" + mkdir -p "$run_dir" + + local cmd=( + "$bench_bin" + -profile "$PROFILE" + -dbs "$DBS" + -keys "$KEYS" + -valsize "$VALSIZE" + -batchsize "$BATCHSIZE" + -test "$TESTS" + -val-pattern "$VAL_PATTERN" + -seed "$SEED" + -progress=false + -keep + -treedb-force-value-pointers="$FORCE_VALUE_POINTERS" + -treedb-index-outer-leaves-in-vlog="$OUTER_LEAVES_IN_VLOG" + -treedb-vlog-compression "$VLOG_COMPRESSION" + -treedb-vlog-compression-autotune "$VLOG_COMPRESSION_AUTOTUNE" + -treedb-vlog-compression-variant "$VLOG_COMPRESSION_VARIANT" + -treedb-vlog-dict-train-bytes "$DICT_TRAIN_BYTES" + -treedb-vlog-dict-dict-bytes "$DICT_BYTES" + ) + + if [[ -n "$COMMON_EXTRA_FLAGS" ]]; then + # shellcheck disable=SC2206 + local common_extra=( $COMMON_EXTRA_FLAGS ) + cmd+=("${common_extra[@]}") + fi + if [[ -n "$extra_flags" ]]; then + # shellcheck disable=SC2206 + local variant_extra=( $extra_flags ) + cmd+=("${variant_extra[@]}") + fi + + printf '%q ' "${cmd[@]}" >"$run_dir/cmd.txt" + echo >>"$run_dir/cmd.txt" + + local bench_log="$run_dir/unified.log" + local run_start run_end + run_start=$(date +%s) + "${cmd[@]}" >"$bench_log" 2>&1 + run_end=$(date +%s) + + local parse_out keep_dir batch_write_ops + parse_out="$(parse_bench_log 
"$bench_log")" + keep_dir="${parse_out%%$'\t'*}" + batch_write_ops="${parse_out#*$'\t'}" + + if [[ -z "$keep_dir" || ! -d "$keep_dir" ]]; then + echo "missing kept dir for $run_id (parsed=$keep_dir)" >&2 + exit 1 + fi + + local sync_app_bytes sync_wal_bytes sync_gzip_bytes + sync_app_bytes="$(du_bytes "$keep_dir")" + sync_wal_bytes="$(du_bytes "$keep_dir/maindb/wal")" + sync_gzip_bytes="$(gzip_dir_bytes "$keep_dir")" + + local rewrite_attempted=0 + local rewrite_seconds=0 + local rewrite_rc=0 + local rewrite_log="$run_dir/rewrite.log" + if [[ "$REWRITE_ENABLED" == "1" ]]; then + rewrite_attempted=1 + local rw_start rw_end + rw_start=$(date +%s) + # shellcheck disable=SC2206 + local rw_args=( $REWRITE_ARGS ) + set +e + "$treemap_bin" vlog-rewrite "$keep_dir" "${rw_args[@]}" >"$rewrite_log" 2>&1 + rewrite_rc=$? + set -e + rw_end=$(date +%s) + rewrite_seconds=$((rw_end - rw_start)) + fi + + local post_app_bytes post_wal_bytes post_gzip_bytes + post_app_bytes="$(du_bytes "$keep_dir")" + post_wal_bytes="$(du_bytes "$keep_dir/maindb/wal")" + post_gzip_bytes="$(gzip_dir_bytes "$keep_dir")" + + local run_json="$run_dir/run.json" + python3 - "$run_json" "$pair_index" "$variant" "$run_start" "$run_end" "$rewrite_attempted" "$rewrite_seconds" "$rewrite_rc" "$batch_write_ops" "$keep_dir" "$sync_app_bytes" "$sync_wal_bytes" "$sync_gzip_bytes" "$post_app_bytes" "$post_wal_bytes" "$post_gzip_bytes" <<'PY' +import json +import sys +from pathlib import Path + +out_path = Path(sys.argv[1]) +pair_index = int(sys.argv[2]) +variant = sys.argv[3] +run_start = int(sys.argv[4]) +run_end = int(sys.argv[5]) +rewrite_attempted = int(sys.argv[6]) +rewrite_seconds = int(sys.argv[7]) +rewrite_rc = int(sys.argv[8]) +batch_write_ops = float(sys.argv[9]) +keep_dir = sys.argv[10] +s_sync_app = int(sys.argv[11]) +s_sync_wal = int(sys.argv[12]) +s_sync_gzip = int(sys.argv[13]) +s_post_app = int(sys.argv[14]) +s_post_wal = int(sys.argv[15]) +s_post_gzip = int(sys.argv[16]) + +t_sync = max(0, run_end - 
run_start) +t_rewrite = rewrite_seconds if rewrite_attempted == 1 else 0 +if rewrite_attempted == 1 and rewrite_rc != 0: + t_total = None +else: + t_total = t_sync + t_rewrite + +payload = { + "pair_index": pair_index, + "variant": variant, + "keep_dir": keep_dir, + "bench": { + "duration_seconds": t_sync, + "batch_write_ops_per_sec": batch_write_ops, + }, + "rewrite": { + "attempted": rewrite_attempted == 1, + "seconds": t_rewrite, + "exit_code": rewrite_rc, + }, + "sizes": { + "sync_app_bytes": s_sync_app, + "sync_wal_bytes": s_sync_wal, + "sync_gzip_bytes": s_sync_gzip, + "post_app_bytes": s_post_app, + "post_wal_bytes": s_post_wal, + "post_gzip_bytes": s_post_gzip, + }, + "metrics": { + "t_sync_seconds": t_sync, + "t_rewrite_seconds": t_rewrite, + "t_total_seconds": t_total, + "batch_write_ops_per_sec": batch_write_ops, + "s_sync_app_bytes": s_sync_app, + "s_sync_wal_bytes": s_sync_wal, + "s_sync_gzip_bytes": s_sync_gzip, + "s_post_app_bytes": s_post_app, + "s_post_wal_bytes": s_post_wal, + "s_post_gzip_bytes": s_post_gzip, + }, +} +out_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") +print(out_path) +PY + + if [[ "$KEEP_DB_DIRS" != "1" ]]; then + rm -rf "$keep_dir" + fi + + echo "run_id=$run_id keep_dir=$keep_dir json=$run_json" +} + +aggregate_and_decide() { + local decision_json="$OUT/decision.json" + python3 - "$OUT" "$SIZE_FIELD" "$SIZE_TOLERANCE_BYTES" "$TIME_TOLERANCE_SECONDS" "$MIN_PAIRS" "$CLEAR_WIN_PAIRS" "$CLEAR_LOSS_PAIRS" "$MAX_PAIRS" "$STOP_ON_CLEAR" "$LOW_SIGNAL_MIN_PAIRS" "$LOW_SIGNAL_NEUTRAL_STREAK" "$decision_json" <<'PY' +import csv +import json +import statistics +import sys +from pathlib import Path + +out = Path(sys.argv[1]) +size_field = sys.argv[2] +size_tol = int(sys.argv[3]) +time_tol = int(sys.argv[4]) +min_pairs = int(sys.argv[5]) +clear_win_pairs = int(sys.argv[6]) +clear_loss_pairs = int(sys.argv[7]) +max_pairs = int(sys.argv[8]) +stop_on_clear = sys.argv[9] == "1" +low_signal_min_pairs = 
int(sys.argv[10]) +low_signal_neutral_streak = int(sys.argv[11]) +decision_path = Path(sys.argv[12]) + +run_files = sorted(out.glob("runs/*/run.json")) +runs = [] +for p in run_files: + try: + runs.append(json.loads(p.read_text(encoding="utf-8"))) + except Exception: + continue +runs.sort(key=lambda r: (int(r.get("pair_index", 0)), str(r.get("variant", "")))) + +runs_csv = out / "runs.csv" +with runs_csv.open("w", newline="", encoding="utf-8") as fh: + w = csv.writer(fh) + w.writerow([ + "pair_index", + "variant", + "keep_dir", + "t_sync_seconds", + "t_rewrite_seconds", + "t_total_seconds", + "batch_write_ops_per_sec", + "s_sync_app_bytes", + "s_sync_wal_bytes", + "s_sync_gzip_bytes", + "s_post_app_bytes", + "s_post_wal_bytes", + "s_post_gzip_bytes", + "rewrite_exit_code", + ]) + for r in runs: + m = r.get("metrics", {}) or {} + rw = r.get("rewrite", {}) or {} + w.writerow([ + int(r.get("pair_index", 0)), + str(r.get("variant", "")), + str(r.get("keep_dir", "")), + m.get("t_sync_seconds"), + m.get("t_rewrite_seconds"), + m.get("t_total_seconds"), + m.get("batch_write_ops_per_sec"), + m.get("s_sync_app_bytes"), + m.get("s_sync_wal_bytes"), + m.get("s_sync_gzip_bytes"), + m.get("s_post_app_bytes"), + m.get("s_post_wal_bytes"), + m.get("s_post_gzip_bytes"), + rw.get("exit_code"), + ]) + +by_pair = {} +for r in runs: + pair = int(r.get("pair_index", 0)) + by_pair.setdefault(pair, {})[str(r.get("variant", ""))] = r + +def delta(a, b): + if a is None or b is None: + return None + try: + return a - b + except Exception: + return None + +pair_rows = [] +wins = 0 +losses = 0 +for pair in sorted(by_pair): + row = by_pair[pair] + ctrl = row.get("control") + cand = row.get("candidate") + if not ctrl or not cand: + continue + + cm = cand.get("metrics", {}) or {} + bm = ctrl.get("metrics", {}) or {} + + d_sync = delta(cm.get("t_sync_seconds"), bm.get("t_sync_seconds")) + d_total = delta(cm.get("t_total_seconds"), bm.get("t_total_seconds")) + d_bw = 
delta(cm.get("batch_write_ops_per_sec"), bm.get("batch_write_ops_per_sec")) + + d_sync_app = delta(cm.get("s_sync_app_bytes"), bm.get("s_sync_app_bytes")) + d_sync_wal = delta(cm.get("s_sync_wal_bytes"), bm.get("s_sync_wal_bytes")) + d_sync_gzip = delta(cm.get("s_sync_gzip_bytes"), bm.get("s_sync_gzip_bytes")) + d_post_app = delta(cm.get("s_post_app_bytes"), bm.get("s_post_app_bytes")) + d_post_wal = delta(cm.get("s_post_wal_bytes"), bm.get("s_post_wal_bytes")) + d_post_gzip = delta(cm.get("s_post_gzip_bytes"), bm.get("s_post_gzip_bytes")) + + d_size_primary = delta(cm.get(size_field), bm.get(size_field)) + + outcome = "neutral" + if d_size_primary is not None and d_total is not None: + win = (d_size_primary <= -size_tol) and (d_total <= time_tol) + loss = (d_size_primary >= size_tol) and (d_total >= -time_tol) + if win and not loss: + outcome = "win" + wins += 1 + elif loss and not win: + outcome = "loss" + losses += 1 + + pair_rows.append( + { + "pair_index": pair, + "delta_t_sync_seconds": d_sync, + "delta_t_total_seconds": d_total, + "delta_batch_write_ops_per_sec": d_bw, + "delta_s_sync_app_bytes": d_sync_app, + "delta_s_sync_wal_bytes": d_sync_wal, + "delta_s_sync_gzip_bytes": d_sync_gzip, + "delta_s_post_app_bytes": d_post_app, + "delta_s_post_wal_bytes": d_post_wal, + "delta_s_post_gzip_bytes": d_post_gzip, + "delta_size_primary_bytes": d_size_primary, + "outcome": outcome, + } + ) + +pairs_csv = out / "pairs.csv" +with pairs_csv.open("w", newline="", encoding="utf-8") as fh: + w = csv.writer(fh) + w.writerow( + [ + "pair_index", + "delta_t_sync_seconds", + "delta_t_total_seconds", + "delta_batch_write_ops_per_sec", + "delta_s_sync_app_bytes", + "delta_s_sync_wal_bytes", + "delta_s_sync_gzip_bytes", + "delta_s_post_app_bytes", + "delta_s_post_wal_bytes", + "delta_s_post_gzip_bytes", + "delta_size_primary_bytes", + "outcome", + ] + ) + for r in pair_rows: + w.writerow( + [ + r["pair_index"], + r["delta_t_sync_seconds"], + r["delta_t_total_seconds"], + 
r["delta_batch_write_ops_per_sec"], + r["delta_s_sync_app_bytes"], + r["delta_s_sync_wal_bytes"], + r["delta_s_sync_gzip_bytes"], + r["delta_s_post_app_bytes"], + r["delta_s_post_wal_bytes"], + r["delta_s_post_gzip_bytes"], + r["delta_size_primary_bytes"], + r["outcome"], + ] + ) + +completed_pairs = len(pair_rows) +neutral = max(0, completed_pairs - wins - losses) +neutral_streak = 0 +for row in reversed(pair_rows): + if row.get("outcome") == "neutral": + neutral_streak += 1 + continue + break + +reason = "continue" +stop = False +if stop_on_clear and completed_pairs >= min_pairs: + if wins >= clear_win_pairs and wins > losses: + stop = True + reason = "clear_improvement" + elif losses >= clear_loss_pairs and losses > wins: + stop = True + reason = "clear_regression" + else: + remaining = max(0, max_pairs - completed_pairs) + can_reach_clear_win = (wins + remaining) >= clear_win_pairs + can_reach_clear_loss = (losses + remaining) >= clear_loss_pairs + if not can_reach_clear_win and not can_reach_clear_loss: + stop = True + reason = "futile_remaining_pairs" + +if (not stop) and completed_pairs >= low_signal_min_pairs and neutral_streak >= low_signal_neutral_streak: + stop = True + reason = "low_signal_neutral_streak" + +if (not stop) and completed_pairs >= max_pairs: + stop = True + reason = "max_pairs" + +med_delta_size = None +med_delta_total = None +size_values = [r["delta_size_primary_bytes"] for r in pair_rows if r.get("delta_size_primary_bytes") is not None] +time_values = [r["delta_t_total_seconds"] for r in pair_rows if r.get("delta_t_total_seconds") is not None] +if size_values: + med_delta_size = statistics.median(size_values) +if time_values: + med_delta_total = statistics.median(time_values) + +summary_md = out / "summary.md" +lines = [] +lines.append("# celestia_fast_gate summary") +lines.append("") +lines.append(f"- completed pairs: `{completed_pairs}`") +lines.append(f"- wins/losses/neutral: `{wins}` / `{losses}` / `{neutral}`") +lines.append(f"- 
neutral streak (tail): `{neutral_streak}`") +lines.append(f"- size field: `{size_field}`") +lines.append(f"- size tolerance bytes: `{size_tol}`") +lines.append(f"- time tolerance seconds: `{time_tol}`") +lines.append(f"- low-signal min pairs: `{low_signal_min_pairs}`") +lines.append(f"- low-signal neutral streak: `{low_signal_neutral_streak}`") +lines.append(f"- median delta(size): `{med_delta_size}`") +lines.append(f"- median delta(time_total): `{med_delta_total}`") +lines.append(f"- decision: `{reason}`") +lines.append("") +lines.append("## Artifacts") +lines.append("") +lines.append(f"- runs csv: `{runs_csv}`") +lines.append(f"- pairs csv: `{pairs_csv}`") +lines.append(f"- per-run json: `{out / 'runs'}`") +summary_md.write_text("\n".join(lines) + "\n", encoding="utf-8") + +review_md = out / "process_review.md" +review = [] +review.append("# Fast Loop Review") +review.append("") +review.append("## Signal Check") +review.append("") +review.append(f"- completed_pairs={completed_pairs}") +review.append(f"- neutral_streak={neutral_streak}") +review.append(f"- reason={reason}") +if med_delta_size is not None: + review.append(f"- median_delta_size_bytes={int(med_delta_size)}") +if med_delta_total is not None: + review.append(f"- median_delta_time_seconds={int(med_delta_total)}") +review.append("") +review.append("## Suggested Next Action") +review.append("") +if reason in {"low_signal_neutral_streak", "futile_remaining_pairs"}: + review.append("- Stop long validation; this loop is currently low-signal for the configured tolerance.") + review.append("- Increase expected effect size (bundle larger code changes) or increase micro workload stress before re-running.") +elif reason == "clear_regression": + review.append("- Reject candidate as-is; run pprof on this fast gate to isolate removable overhead before retrying.") +elif reason == "clear_improvement": + review.append("- Promote candidate to run_celestia A/B confirmation.") +else: + review.append("- Continue collecting 
interleaved pairs until a clear outcome or low-signal stop triggers.") +review_md.write_text("\n".join(review) + "\n", encoding="utf-8") + +payload = { + "completed_pairs": completed_pairs, + "wins": wins, + "losses": losses, + "neutral": neutral, + "neutral_streak": neutral_streak, + "size_field": size_field, + "size_tolerance_bytes": size_tol, + "time_tolerance_seconds": time_tol, + "median_delta_size_bytes": med_delta_size, + "median_delta_time_seconds": med_delta_total, + "stop": stop, + "reason": reason, +} +decision_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") +print(json.dumps(payload, sort_keys=True)) +PY +} + +run_pair() { + local pair_index="$1" + if (( pair_index % 2 == 1 )); then + run_variant "$pair_index" "control" + sleep "$SLEEP_BETWEEN_RUNS_SECONDS" + run_variant "$pair_index" "candidate" + else + run_variant "$pair_index" "candidate" + sleep "$SLEEP_BETWEEN_RUNS_SECONDS" + run_variant "$pair_index" "control" + fi +} + +require_cmd git +require_cmd go +require_cmd python3 +require_cmd tar +require_cmd gzip +require_cmd wc + +if (( MAX_PAIRS < 1 )); then + echo "MAX_PAIRS must be >= 1" >&2 + exit 2 +fi + +mkdir -p "$OUT" +setup_bins + +cat >"$OUT/meta.txt" <= max_pairs: - stop = True - reason = "max_pairs" -elif stop_on_clear and completed_pairs >= min_pairs: +if stop_on_clear and completed_pairs >= min_pairs: if wins >= clear_win_pairs and wins > losses: stop = True reason = "clear_improvement" elif losses >= clear_loss_pairs and losses > wins: stop = True reason = "clear_regression" + else: + remaining = max(0, max_pairs - completed_pairs) + can_reach_clear_win = (wins + remaining) >= clear_win_pairs + can_reach_clear_loss = (losses + remaining) >= clear_loss_pairs + if not can_reach_clear_win and not can_reach_clear_loss: + stop = True + reason = "futile_remaining_pairs" + +if (not stop) and completed_pairs >= low_signal_min_pairs and neutral_streak >= low_signal_neutral_streak: + stop = True + reason = 
"low_signal_neutral_streak" + +if (not stop) and completed_pairs >= max_pairs: + stop = True + reason = "max_pairs" summary_md = out / "summary.md" lines = [] lines.append("# run_celestia A/B summary") lines.append("") lines.append(f"- completed pairs: `{completed_pairs}`") -lines.append(f"- wins/losses/neutral: `{wins}` / `{losses}` / `{max(0, completed_pairs - wins - losses)}`") +lines.append(f"- wins/losses/neutral: `{wins}` / `{losses}` / `{neutral}`") +lines.append(f"- neutral streak (tail): `{neutral_streak}`") lines.append(f"- size tolerance bytes: `{size_tol}`") lines.append(f"- time tolerance seconds: `{time_tol}`") +lines.append(f"- low-signal min pairs: `{low_signal_min_pairs}`") +lines.append(f"- low-signal neutral streak: `{low_signal_neutral_streak}`") lines.append(f"- decision: `{reason}`") lines.append("") lines.append("## Artifacts") @@ -461,7 +490,8 @@ payload = { "completed_pairs": completed_pairs, "wins": wins, "losses": losses, - "neutral": max(0, completed_pairs - wins - losses), + "neutral": neutral, + "neutral_streak": neutral_streak, "stop": stop, "reason": reason, } From 55a41d1fd7039ebb03760ce5d02297d4830cbde6 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 17:41:39 -1000 Subject: [PATCH 43/61] treedb: add opt-in hot-debt-only checkpoint-kick gate --- TreeDB/caching/db.go | 47 ++++++-- .../caching/vlog_generation_scheduler_test.go | 104 ++++++++++++++++++ docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md | 5 + ...HECKPOINT_KICK_HOT_DEBT_ONLY_2026-03-28.md | 89 +++++++++++++++ scripts/analyze_vlog_maintenance_capacity.py | 8 ++ 5 files changed, 243 insertions(+), 10 deletions(-) create mode 100644 docs/benchmarks/CELESTIA_CHECKPOINT_KICK_HOT_DEBT_ONLY_2026-03-28.md diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 139119a46..1858c4810 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -1860,6 +1860,11 @@ const ( envDisableVlogGenerationVacuum = "TREEDB_DISABLE_VLOG_GENERATION_VACUUM" 
envDisableVlogGenerationLoop = "TREEDB_DISABLE_VLOG_GENERATION_LOOP" envDisableVlogGenerationCheckpointKick = "TREEDB_DISABLE_VLOG_GENERATION_CHECKPOINT_KICK" + // Experimental WAL-off checkpoint-kick guard: when enabled, avoid starting + // fresh rewrite planning during hot foreground activity. Queued rewrite debt + // (or deferred maintenance due) remains eligible so resumable progress is not + // starved. + envEnableVlogGenerationCheckpointKickHotDebtOnly = "TREEDB_ENABLE_VLOG_GENERATION_CHECKPOINT_KICK_HOT_DEBT_ONLY" // Experimental WAL-off override: allow rewrite planning/execution before the // first explicit checkpoint. Disabled by default because it can add restore // contention during early state-sync. @@ -5965,6 +5970,7 @@ type DB struct { vlogGenerationCheckpointKickRuns atomic.Uint64 vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 vlogGenerationCheckpointKickGCRuns atomic.Uint64 + vlogGenerationCheckpointKickSkippedHotNoDebt atomic.Uint64 vlogGenerationCheckpointKickPending atomic.Bool vlogGenerationDeferredMaintenancePending atomic.Bool vlogGenerationDeferredMaintenanceRunning atomic.Bool @@ -15429,6 +15435,34 @@ func (db *DB) maybeKickVlogGenerationMaintenanceAfterCheckpoint() { return } now := time.Now() + rewriteDisabled := envBool(envDisableVlogGenerationRewrite) + rewriteQueueLen := 0 + if !rewriteDisabled { + rewriteQueue, qerr := db.currentVlogGenerationRewriteQueue() + if qerr != nil { + db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerError) + if db.notifyError != nil { + db.notifyError(fmt.Errorf("cachingdb: load generational rewrite queue for checkpoint kick: %w", qerr)) + } + return + } + rewriteQueueLen = len(rewriteQueue) + } + if envBool(envEnableVlogGenerationCheckpointKickHotDebtOnly) && !rewriteDisabled { + quiet := db.foregroundActivityQuietFor(now, vlogGenerationMaintenanceQuietWindow, vlogForegroundReadQuietWindow) + if !quiet && rewriteQueueLen == 0 && !db.vlogGenerationDeferredMaintenanceDue(now) { + 
db.vlogGenerationCheckpointKickSkippedHotNoDebt.Add(1) + db.debugVlogMaintf( + "checkpoint_kick_skip reason=foreground_hot_no_debt quiet=%t queue_len=%d checkpoint_pending=%t deferred_pending=%t deferred_due=%t", + quiet, + rewriteQueueLen, + db.vlogGenerationCheckpointKickPending.Load(), + db.vlogGenerationDeferredMaintenancePending.Load(), + db.vlogGenerationDeferredMaintenanceDue(now), + ) + return + } + } last := db.vlogGenerationLastCheckpointKickUnixNano.Load() if last > 0 && now.Sub(time.Unix(0, last)) < vlogGenerationCheckpointKickMinInterval { db.debugVlogMaintf( @@ -15440,16 +15474,8 @@ func (db *DB) maybeKickVlogGenerationMaintenanceAfterCheckpoint() { } // Avoid forcing extra checkpoint boundaries when rewrite is clearly ineligible. // Skip this fast-path when rewrite is disabled so GC-only kicks still run. - if !envBool(envDisableVlogGenerationRewrite) { - rewriteQueue, qerr := db.currentVlogGenerationRewriteQueue() - if qerr != nil { - db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerError) - if db.notifyError != nil { - db.notifyError(fmt.Errorf("cachingdb: load generational rewrite queue for checkpoint kick: %w", qerr)) - } - return - } - if len(rewriteQueue) == 0 { + if !rewriteDisabled { + if rewriteQueueLen == 0 { if trigger := db.valueLogRewriteTriggerBytes; trigger > 0 { retained, bytes := db.valueLogRetainedStats() if bytes < trigger && retained < 2 { @@ -20987,6 +21013,7 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.checkpoint_kick.runs"] = fmt.Sprintf("%d", db.vlogGenerationCheckpointKickRuns.Load()) stats["treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"] = fmt.Sprintf("%d", db.vlogGenerationCheckpointKickRewriteRuns.Load()) stats["treedb.cache.vlog_generation.checkpoint_kick.gc_runs"] = fmt.Sprintf("%d", db.vlogGenerationCheckpointKickGCRuns.Load()) + stats["treedb.cache.vlog_generation.checkpoint_kick.skipped_hot_no_debt"] = fmt.Sprintf("%d", 
db.vlogGenerationCheckpointKickSkippedHotNoDebt.Load()) stats["treedb.cache.vlog_generation.maintenance.attempts"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceAttempts.Load()) stats["treedb.cache.vlog_generation.maintenance.acquired"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceAcquired.Load()) stats["treedb.cache.vlog_generation.maintenance.collisions"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceCollisions.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index f25b8b4dd..61f4818ff 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -4324,6 +4324,52 @@ func TestCheckpoint_KicksVlogGenerationRewriteDespiteRecentForegroundActivity(t } } +func TestCheckpoint_KickHotDebtOnlySkipsFreshPlanDuringRecentForegroundActivity(t *testing.T) { + disableVlogGenerationLoop(t) + t.Setenv(envEnableVlogGenerationCheckpointKickHotDebtOnly, "1") + + dir := t.TempDir() + + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + planResponse: backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11}, + SelectedBytesLive: 128, + }, + rewriteResponse: backenddb.ValueLogRewriteStats{BytesBefore: 64, BytesAfter: 32, RecordsCopied: 1}, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + t.Cleanup(cleanup) + skipRetainedPrune(db) + db.testSkipVlogCheckpointKick = false + + hot := time.Now().UnixNano() + db.lastForegroundWriteUnixNano.Store(hot) + db.lastForegroundReadUnixNano.Store(hot) + + db.maybeKickVlogGenerationMaintenanceAfterCheckpoint() + + time.Sleep(150 * time.Millisecond) + if _, calls := recorder.recordedPlan(); calls != 0 { + t.Fatalf("plan calls=%d want 0", calls) + } + if _, calls := recorder.recordedRewrite(); calls != 0 { + t.Fatalf("rewrite calls=%d want 0", calls) + } + stats := 
db.Stats() + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.runs"]; got != "0" { + t.Fatalf("checkpoint kick runs=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.skipped_hot_no_debt"]; got != "1" { + t.Fatalf("checkpoint kick skipped_hot_no_debt=%q want 1", got) + } +} + func TestCheckpoint_DoesNotKickVlogGenerationRewrite_WALOn(t *testing.T) { disableVlogGenerationLoop(t) @@ -4431,6 +4477,64 @@ func TestCheckpoint_KicksQueuedRewriteDebtBelowTriggerFloor(t *testing.T) { time.Sleep(10 * time.Millisecond) } + stats := db.Stats() + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.runs"]; got != "1" { + t.Fatalf("checkpoint kick runs=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"]; got != "1" { + t.Fatalf("checkpoint kick rewrite runs=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.skipped_hot_no_debt"]; got != "0" { + t.Fatalf("checkpoint kick skipped_hot_no_debt=%q want 0", got) + } +} + +func TestCheckpoint_KickHotDebtOnlyStillRunsQueuedRewriteDebtDuringRecentForegroundActivity(t *testing.T) { + disableVlogGenerationLoop(t) + t.Setenv(envEnableVlogGenerationCheckpointKickHotDebtOnly, "1") + + dir := t.TempDir() + + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + planResponse: backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11}, + SelectedBytesLive: 128, + }, + rewriteResponse: backenddb.ValueLogRewriteStats{BytesBefore: 64, BytesAfter: 32, RecordsCopied: 1}, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + t.Cleanup(cleanup) + skipRetainedPrune(db) + db.testSkipVlogCheckpointKick = false + db.valueLogRewriteTriggerBytes = 1 << 30 + if err := db.setVlogGenerationRewriteQueue([]uint32{11}); err != nil { + t.Fatalf("seed rewrite queue: %v", err) + } + 
db.vlogGenerationRewriteBudgetTokensBytes.Store(1024) + hot := time.Now().UnixNano() + db.lastForegroundWriteUnixNano.Store(hot) + db.lastForegroundReadUnixNano.Store(hot) + + db.maybeKickVlogGenerationMaintenanceAfterCheckpoint() + + deadline := time.Now().Add(2 * schedulerTestWait(t)) + for { + if _, calls := recorder.recordedRewrite(); calls == 1 { + break + } + if time.Now().After(deadline) { + _, rewriteCalls := recorder.recordedRewrite() + t.Fatalf("checkpoint kick with queued debt did not run rewrite in time: rewriteCalls=%d", rewriteCalls) + } + time.Sleep(10 * time.Millisecond) + } + stats := db.Stats() if got := stats["treedb.cache.vlog_generation.checkpoint_kick.runs"]; got != "1" { t.Fatalf("checkpoint kick runs=%q want 1", got) diff --git a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md index 6f5d83aa0..249a2b753 100644 --- a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md +++ b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md @@ -103,6 +103,11 @@ The harness alternates run order per pair (`control->candidate`, then - Allows rewrite planning/execution before the first explicit checkpoint. - Default is disabled to avoid adding early restore contention. - Use for controlled `run_celestia` experiments when `maintenance.skip.before_first_checkpoint` dominates and live rewrite never starts. +- `TREEDB_ENABLE_VLOG_GENERATION_CHECKPOINT_KICK_HOT_DEBT_ONLY=1` + - WAL-off only. + - During checkpoint-kick maintenance, skips starting a fresh rewrite plan while foreground activity is hot and rewrite queue debt is empty. + - Still allows queued rewrite debt (and deferred-due passes) to run. + - Default is disabled. 
## Bench Commands ### Churn sanity (TreeDB) diff --git a/docs/benchmarks/CELESTIA_CHECKPOINT_KICK_HOT_DEBT_ONLY_2026-03-28.md b/docs/benchmarks/CELESTIA_CHECKPOINT_KICK_HOT_DEBT_ONLY_2026-03-28.md new file mode 100644 index 000000000..2e40ec607 --- /dev/null +++ b/docs/benchmarks/CELESTIA_CHECKPOINT_KICK_HOT_DEBT_ONLY_2026-03-28.md @@ -0,0 +1,89 @@ +# Celestia: Checkpoint-Kick Hot-Debt-Only Gate (2026-03-28) + +## Goal +Reduce `run_celestia` sync wall-time regression from live value-log maintenance while preserving on-disk size gains. + +## Change Under Test +Candidate enables: + +- `TREEDB_ENABLE_VLOG_GENERATION_CHECKPOINT_KICK_HOT_DEBT_ONLY=1` + +Behavior: + +- In WAL-off checkpoint-kick path, if foreground is hot and rewrite queue is empty, skip starting a fresh rewrite plan. +- Queued rewrite debt and deferred-due maintenance still run. +- Default behavior remains unchanged unless this env flag is set. + +## Commands +Both campaigns used fixed trust/target and a single interleaved pair (`MAX_PAIRS=1`) with offline rewrite enabled. 
+ +Common env (both variants): + +- `TREEDB_OPEN_PROFILE=fast` +- `POLL_INTERVAL_SECONDS=1` +- `FREEZE_REMOTE_HEIGHT_AT_START=1` +- `ALLOW_CLAMPED_TARGET_EARLY_EXIT=1` +- `STOP_AT_LOCAL_HEIGHT=` +- `TRUST_HEIGHT=` +- `TRUST_HASH=` + +Variant-specific env: + +- `main`: `LOCAL_GOMAP_DIR=/tmp/gomap_ab_base_20260328162444` +- `hot_debt_only`: `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active` + `TREEDB_ENABLE_VLOG_GENERATION_CHECKPOINT_KICK_HOT_DEBT_ONLY=1` + +Harness: + +```bash +OUT_DIR= \ +CONTROL_ENV_FILE= \ +CANDIDATE_ENV_FILE= \ +MAX_PAIRS=1 MIN_PAIRS=1 CLEAR_WIN_PAIRS=1 CLEAR_LOSS_PAIRS=1 \ +LOW_SIGNAL_MIN_PAIRS=1 LOW_SIGNAL_NEUTRAL_STREAK=1 \ +SIZE_TOLERANCE_BYTES=$((64<<20)) TIME_TOLERANCE_SECONDS=120 \ +REWRITE_ENABLED=1 \ +./scripts/run_celestia_ab.sh +``` + +## Runs +- control=main, candidate=hot_debt_only: + - `/tmp/celestia_ab_hotdebt_20260328171204` +- control=hot_debt_only, candidate=main (swapped to counter order bias): + - `/tmp/celestia_ab_hotdebt_swapped_20260328172453` + +## Normalized Results (hot_debt_only - main) +- Run A (hot_debt_only as candidate): + - `delta_t_sync_seconds = -16` + - `delta_t_total_seconds = -17` + - `delta_s_sync_app_bytes = -694,418,294` + - `delta_s_post_wal_bytes = +3,315,722` +- Run B (hot_debt_only as control, normalized): + - `delta_t_sync_seconds = +3` + - `delta_t_total_seconds = +2` + - `delta_s_sync_app_bytes = -98,696,592` + - `delta_s_post_wal_bytes = -3,665,002` + +Two-run median/average (same with n=2): + +- `delta_t_sync_seconds = -6.5s` +- `delta_t_total_seconds = -7.5s` +- `delta_s_sync_app_bytes = -396,557,443B` (~`-378.2 MiB`) +- `delta_s_post_wal_bytes = -174,640B` (~`-170.5 KiB`, effectively neutral) + +## Maintenance Counters +Across both runs, both variants showed: + +- `rewrite_runs=0` +- `checkpoint_kick_runs=0` + +Candidate (`hot_debt_only`) showed one lightweight GC pass in each run (`gc_runs=1`), with no rewrite execution. 
+ +## Takeaway +The hot-debt-only gate removed checkpoint-kick rewrite pressure during hot sync windows and improved sync+rewrite wall time in this small sample, while keeping pre-rewrite app size better than main and post-rewrite WAL roughly neutral. + +## Next Step +Run an interleaved sequence with more pairs (stop-on-significance) and include the new stat key: + +- `treedb.cache.vlog_generation.checkpoint_kick.skipped_hot_no_debt` + +to confirm skip path activation frequency under full mainnet sync pressure. diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py index 292a8a9e8..b5b6eabc0 100755 --- a/scripts/analyze_vlog_maintenance_capacity.py +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -320,6 +320,7 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: "checkpoint_kick_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.runs"), "checkpoint_kick_gc_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.gc_runs"), "checkpoint_kick_rewrite_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"), + "checkpoint_kick_skipped_hot_no_debt": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.skipped_hot_no_debt"), } skip_keys = [ @@ -505,6 +506,13 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst f"priority={skips['priority_pending']} " f"checkpoint={skips['checkpoint_inflight']}" ) + print( + " checkpoint-kick: " + f"runs={summary['checkpoint_kick_runs']} " + f"rewrite_runs={summary['checkpoint_kick_rewrite_runs']} " + f"gc_runs={summary['checkpoint_kick_gc_runs']} " + f"skipped_hot_no_debt={summary['checkpoint_kick_skipped_hot_no_debt']}" + ) print("") print("Rewrite economics") From 53d39b6bfbebc74636f878d4b2d20674ac3115bc Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 18:31:28 -1000 Subject: [PATCH 44/61] compression: reuse encoder and scratch in k-profile eval --- 
TreeDB/internal/compression/profile.go | 52 +++++++++++-- TreeDB/internal/compression/profile_test.go | 84 +++++++++++++++++++++ 2 files changed, 131 insertions(+), 5 deletions(-) create mode 100644 TreeDB/internal/compression/profile_test.go diff --git a/TreeDB/internal/compression/profile.go b/TreeDB/internal/compression/profile.go index 0114c98e6..cf0c95745 100644 --- a/TreeDB/internal/compression/profile.go +++ b/TreeDB/internal/compression/profile.go @@ -79,6 +79,21 @@ func ChooseKForDictOptions(dict []byte, samples [][]byte, opts ChooseKOptions) ( ks = []int{1, 2, 4, 8, 16, 32} } ks = normalizeCandidateK(ks) + var sharedEnc *zstd.Encoder + if dict != nil { + if enc, err := zstd.NewWriter(nil, zstd.WithEncoderDict(dict), zstd.WithEncoderLevel(zstd.SpeedFastest)); err == nil { + sharedEnc = enc + } + } else { + if enc, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedFastest)); err == nil { + sharedEnc = enc + } + } + if sharedEnc != nil { + defer sharedEnc.Close() + } + var concatScratch []byte + var encodedScratch []byte scores := make([]kScore, 0, len(ks)) var baseline kScore for _, k := range ks { @@ -89,7 +104,12 @@ func ChooseKForDictOptions(dict []byte, samples [][]byte, opts ChooseKOptions) ( if used == 0 { continue } - payload, meta, raw, encodeNs := batchTotals(dict, eval[:used], k, opts.EncodeNsPerRawByte) + payload, meta, raw, encodeNs := 0, 0, 0, int64(0) + if sharedEnc != nil { + payload, meta, raw, encodeNs = batchTotalsWithEncoder(sharedEnc, eval[:used], k, opts.EncodeNsPerRawByte, &concatScratch, &encodedScratch) + } else { + payload, meta, raw, encodeNs = batchTotals(dict, eval[:used], k, opts.EncodeNsPerRawByte) + } if raw == 0 { continue } @@ -200,7 +220,6 @@ func batchTotals(dict []byte, samples [][]byte, k int, encodeNsPerRawByte float6 return 0, 0, 0, 0 } samples = samples[:n] - batches := n / k var enc *zstd.Encoder var err error if dict != nil { @@ -212,6 +231,23 @@ func batchTotals(dict []byte, samples [][]byte, k int, 
encodeNsPerRawByte float6 return 0, 0, 0, 0 } defer enc.Close() + var concatScratch []byte + var encodedScratch []byte + return batchTotalsWithEncoder(enc, samples, k, encodeNsPerRawByte, &concatScratch, &encodedScratch) +} + +func batchTotalsWithEncoder(enc *zstd.Encoder, samples [][]byte, k int, encodeNsPerRawByte float64, concatScratch *[]byte, encodedScratch *[]byte) (payload int, meta int, raw int, encodeNs int64) { + if enc == nil || k <= 0 { + return 0, 0, 0, 0 + } + n := (len(samples) / k) * k + if n == 0 { + return 0, 0, 0, 0 + } + samples = samples[:n] + batches := n / k + buf := *concatScratch + encoded := *encodedScratch started := time.Now() for b := 0; b < batches; b++ { start := b * k @@ -221,14 +257,18 @@ func batchTotals(dict []byte, samples [][]byte, k int, encodeNsPerRawByte float6 raw += len(samples[i]) total += len(samples[i]) } - buf := make([]byte, total) + if cap(buf) < total { + buf = make([]byte, total) + } else { + buf = buf[:total] + } pos := 0 for i := start; i < end; i++ { copy(buf[pos:], samples[i]) pos += len(samples[i]) } - c := enc.EncodeAll(buf, nil) - payload += len(c) + encoded = enc.EncodeAll(buf, encoded[:0]) + payload += len(encoded) // Account for the full on-disk framing overhead: // - record header (CRC/version/flags/txn/bodyLen) // - frame header + dict_id + RID table + offsets table @@ -245,6 +285,8 @@ func batchTotals(dict []byte, samples [][]byte, k int, encodeNsPerRawByte float6 } else { encodeNs = time.Since(started).Nanoseconds() } + *concatScratch = buf[:0] + *encodedScratch = encoded[:0] return payload, meta, raw, encodeNs } diff --git a/TreeDB/internal/compression/profile_test.go b/TreeDB/internal/compression/profile_test.go new file mode 100644 index 000000000..15dbd18d2 --- /dev/null +++ b/TreeDB/internal/compression/profile_test.go @@ -0,0 +1,84 @@ +package compression + +import ( + "bytes" + "encoding/binary" + "testing" + + "github.com/snissn/compress/zstd" +) + +func buildProfileSamples(n int) [][]byte { + 
samples := make([][]byte, 0, n) + base := bytes.Repeat([]byte("compressible-"), 64) + for i := 0; i < n; i++ { + buf := make([]byte, 1024) + copy(buf, base) + binary.LittleEndian.PutUint32(buf[len(buf)-4:], uint32(i)) + samples = append(samples, buf) + } + return samples +} + +func mustBuildValidDict(t *testing.T, samples [][]byte) []byte { + t.Helper() + history := make([]byte, 0, 1<<16) + for _, s := range samples { + history = append(history, s...) + } + dict, err := buildAndValidateDict(42, samples, history, zstd.SpeedFastest) + if err != nil { + t.Fatalf("build dict: %v", err) + } + if len(dict) == 0 { + t.Fatalf("expected non-empty dict") + } + return dict +} + +func TestBatchTotalsWithEncoder_MatchesBatchTotals_NoDict(t *testing.T) { + samples := buildProfileSamples(16) + encodeNsPerRawByte := 1.25 + + for _, k := range []int{1, 2, 4, 8} { + wantPayload, wantMeta, wantRaw, wantEncodeNS := batchTotals(nil, samples, k, encodeNsPerRawByte) + enc, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedFastest)) + if err != nil { + t.Fatalf("new writer: %v", err) + } + + var concatScratch []byte + var encodedScratch []byte + gotPayload, gotMeta, gotRaw, gotEncodeNS := batchTotalsWithEncoder(enc, samples, k, encodeNsPerRawByte, &concatScratch, &encodedScratch) + _ = enc.Close() + + if gotPayload != wantPayload || gotMeta != wantMeta || gotRaw != wantRaw || gotEncodeNS != wantEncodeNS { + t.Fatalf("k=%d mismatch got=(payload=%d meta=%d raw=%d encodeNs=%d) want=(payload=%d meta=%d raw=%d encodeNs=%d)", + k, gotPayload, gotMeta, gotRaw, gotEncodeNS, wantPayload, wantMeta, wantRaw, wantEncodeNS) + } + } +} + +func TestBatchTotalsWithEncoder_MatchesBatchTotals_WithDict(t *testing.T) { + samples := buildProfileSamples(256) + dict := mustBuildValidDict(t, samples) + encodeNsPerRawByte := 2.0 + + for _, k := range []int{1, 2, 3, 6} { + wantPayload, wantMeta, wantRaw, wantEncodeNS := batchTotals(dict, samples, k, encodeNsPerRawByte) + enc, err := zstd.NewWriter(nil, 
zstd.WithEncoderDict(dict), zstd.WithEncoderLevel(zstd.SpeedFastest)) + if err != nil { + t.Fatalf("new dict writer: %v", err) + } + + var concatScratch []byte + var encodedScratch []byte + gotPayload, gotMeta, gotRaw, gotEncodeNS := batchTotalsWithEncoder(enc, samples, k, encodeNsPerRawByte, &concatScratch, &encodedScratch) + _ = enc.Close() + + if gotPayload != wantPayload || gotMeta != wantMeta || gotRaw != wantRaw || gotEncodeNS != wantEncodeNS { + t.Fatalf("k=%d mismatch got=(payload=%d meta=%d raw=%d encodeNs=%d) want=(payload=%d meta=%d raw=%d encodeNs=%d)", + k, gotPayload, gotMeta, gotRaw, gotEncodeNS, wantPayload, wantMeta, wantRaw, wantEncodeNS) + } + } +} From 6aaca742a729d7a2f17700a8a21adcb3fe35b04b Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 18:46:42 -1000 Subject: [PATCH 45/61] compression: reduce dict autotune encoder overhead --- TreeDB/internal/compression/profile.go | 91 +++++++++++++++++---- TreeDB/internal/compression/profile_test.go | 13 ++- TreeDB/internal/compression/trainer.go | 14 +++- 3 files changed, 97 insertions(+), 21 deletions(-) diff --git a/TreeDB/internal/compression/profile.go b/TreeDB/internal/compression/profile.go index cf0c95745..2e5e705a3 100644 --- a/TreeDB/internal/compression/profile.go +++ b/TreeDB/internal/compression/profile.go @@ -44,6 +44,13 @@ type kScore struct { score float64 } +const ( + // Bound evaluation work so training cost stays predictable on long streams. + // Use even down-sampling rather than prefix truncation to preserve shape. 
+ maxChooseKEvalSamples = 4096 + maxDecodeCostSamples = 256 +) + func ChooseKForDict(dict []byte, samples [][]byte) (profile *ActiveProfile) { return ChooseKForDictOptions(dict, samples, ChooseKOptions{}) } @@ -59,8 +66,8 @@ func ChooseKForDictOptions(dict []byte, samples [][]byte, opts ChooseKOptions) ( return nil } eval := samples - if len(eval) > 10000 { - eval = eval[:10000] + if len(eval) > maxChooseKEvalSamples { + eval = evenlySampleRecords(eval, maxChooseKEvalSamples) } rawTotal := 0 for _, v := range eval { @@ -70,9 +77,9 @@ func ChooseKForDictOptions(dict []byte, samples [][]byte, opts ChooseKOptions) ( return nil } - nsPerByte := decodeCostEstimate(dict, eval) - if opts.DecodeNsPerRawByte > 0 { - nsPerByte = opts.DecodeNsPerRawByte + nsPerByte := opts.DecodeNsPerRawByte + if nsPerByte <= 0 { + nsPerByte = decodeCostEstimate(dict, eval) } ks := opts.CandidateK if len(ks) == 0 { @@ -81,11 +88,20 @@ func ChooseKForDictOptions(dict []byte, samples [][]byte, opts ChooseKOptions) ( ks = normalizeCandidateK(ks) var sharedEnc *zstd.Encoder if dict != nil { - if enc, err := zstd.NewWriter(nil, zstd.WithEncoderDict(dict), zstd.WithEncoderLevel(zstd.SpeedFastest)); err == nil { + if enc, err := zstd.NewWriter(nil, + zstd.WithEncoderDict(dict), + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ); err == nil { sharedEnc = enc } } else { - if enc, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedFastest)); err == nil { + if enc, err := zstd.NewWriter(nil, + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ); err == nil { sharedEnc = enc } } @@ -223,9 +239,18 @@ func batchTotals(dict []byte, samples [][]byte, k int, encodeNsPerRawByte float6 var enc *zstd.Encoder var err error if dict != nil { - enc, err = zstd.NewWriter(nil, zstd.WithEncoderDict(dict), zstd.WithEncoderLevel(zstd.SpeedFastest)) + enc, err = zstd.NewWriter(nil, + 
zstd.WithEncoderDict(dict), + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ) } else { - enc, err = zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedFastest)) + enc, err = zstd.NewWriter(nil, + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ) } if err != nil { return 0, 0, 0, 0 @@ -291,30 +316,39 @@ func batchTotalsWithEncoder(enc *zstd.Encoder, samples [][]byte, k int, encodeNs } func decodeCostEstimate(dict []byte, samples [][]byte) float64 { - n := len(samples) - if n > 500 { - n = 500 + eval := samples + if len(eval) > maxDecodeCostSamples { + eval = evenlySampleRecords(eval, maxDecodeCostSamples) } - enc, err := zstd.NewWriter(nil, zstd.WithEncoderDict(dict), zstd.WithEncoderLevel(zstd.SpeedFastest)) + n := len(eval) + enc, err := zstd.NewWriter(nil, + zstd.WithEncoderDict(dict), + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ) if err != nil { return 1.0 } defer enc.Close() - frames := make([][]byte, n) + totalRaw := 0 + var encoded []byte for i := 0; i < n; i++ { - totalRaw += len(samples[i]) - frames[i] = enc.EncodeAll(samples[i], nil) + totalRaw += len(eval[i]) + encoded = enc.EncodeAll(eval[i], encoded[:0]) } dec, err := zstd.NewReader(nil, zstd.WithDecoderDicts(dict)) if err != nil { return 1.0 } defer dec.Close() + var out []byte start := time.Now() for i := 0; i < n; i++ { - out, _ = dec.DecodeAll(frames[i], out[:0]) + encoded = enc.EncodeAll(eval[i], encoded[:0]) + out, _ = dec.DecodeAll(encoded, out[:0]) if len(out) > 0 { _ = out[0] } @@ -325,3 +359,26 @@ func decodeCostEstimate(dict []byte, samples [][]byte) float64 { } return float64(elapsed.Nanoseconds()) / float64(totalRaw) } + +func evenlySampleRecords(samples [][]byte, limit int) [][]byte { + if limit <= 0 || len(samples) <= limit { + return samples + } + out := make([][]byte, 0, limit) + last := -1 + for i := 
0; i < limit; i++ { + idx := (i * len(samples)) / limit + if idx >= len(samples) { + idx = len(samples) - 1 + } + if idx <= last { + idx = last + 1 + if idx >= len(samples) { + idx = len(samples) - 1 + } + } + last = idx + out = append(out, samples[idx]) + } + return out +} diff --git a/TreeDB/internal/compression/profile_test.go b/TreeDB/internal/compression/profile_test.go index 15dbd18d2..1c1c26136 100644 --- a/TreeDB/internal/compression/profile_test.go +++ b/TreeDB/internal/compression/profile_test.go @@ -42,7 +42,11 @@ func TestBatchTotalsWithEncoder_MatchesBatchTotals_NoDict(t *testing.T) { for _, k := range []int{1, 2, 4, 8} { wantPayload, wantMeta, wantRaw, wantEncodeNS := batchTotals(nil, samples, k, encodeNsPerRawByte) - enc, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedFastest)) + enc, err := zstd.NewWriter(nil, + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ) if err != nil { t.Fatalf("new writer: %v", err) } @@ -66,7 +70,12 @@ func TestBatchTotalsWithEncoder_MatchesBatchTotals_WithDict(t *testing.T) { for _, k := range []int{1, 2, 3, 6} { wantPayload, wantMeta, wantRaw, wantEncodeNS := batchTotals(dict, samples, k, encodeNsPerRawByte) - enc, err := zstd.NewWriter(nil, zstd.WithEncoderDict(dict), zstd.WithEncoderLevel(zstd.SpeedFastest)) + enc, err := zstd.NewWriter(nil, + zstd.WithEncoderDict(dict), + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ) if err != nil { t.Fatalf("new dict writer: %v", err) } diff --git a/TreeDB/internal/compression/trainer.go b/TreeDB/internal/compression/trainer.go index 31c004bbc..ee525935c 100644 --- a/TreeDB/internal/compression/trainer.go +++ b/TreeDB/internal/compression/trainer.go @@ -838,7 +838,12 @@ func (t *Trainer) train(samples [][]byte, dictBytes int, level zstd.EncoderLevel } } - enc, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(level), zstd.WithEncoderCRC(false), 
zstd.WithEncoderDict(bestProfile.Dict)) + enc, err := zstd.NewWriter(nil, + zstd.WithEncoderLevel(level), + zstd.WithEncoderCRC(false), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderDict(bestProfile.Dict), + ) if err != nil { log.Printf("treedb: dict training encode setup failed stream=%d err=%v", slabID, err) return @@ -930,7 +935,12 @@ func shapeAndValidateDict(dict []byte, dictBytes int, level zstd.EncoderLevel) ( } func validateDict(dict []byte, level zstd.EncoderLevel) error { - enc, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(level), zstd.WithEncoderCRC(false), zstd.WithEncoderDict(dict)) + enc, err := zstd.NewWriter(nil, + zstd.WithEncoderLevel(level), + zstd.WithEncoderCRC(false), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderDict(dict), + ) if err != nil { return err } From 87c345ae65b125d5952c2374b5352dc570f4bcf2 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 18:53:00 -1000 Subject: [PATCH 46/61] caching: add rewrite efficiency and budget rate stats --- TreeDB/caching/db.go | 61 +++++++++++++++++-- .../caching/vlog_generation_scheduler_test.go | 32 ++++++++++ cmd/unified_bench/main.go | 9 +++ 3 files changed, 96 insertions(+), 6 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 1858c4810..ea4482b18 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -20937,6 +20937,46 @@ func (db *DB) Stats() map[string]string { rewriteExecTotalNS := db.vlogGenerationRewriteExecTotalNanos.Load() rewriteExecMaxNS := db.vlogGenerationRewriteExecMaxNanos.Load() rewriteRuns := db.vlogGenerationRewriteRuns.Load() + rewriteBytesInTotal := db.vlogGenerationRewriteBytesIn.Load() + rewriteBytesOutTotal := db.vlogGenerationRewriteBytesOut.Load() + rewriteReclaimedBytesTotal := db.vlogGenerationRewriteReclaimedBytes.Load() + rewriteProcessedLiveBytes := db.vlogGenerationRewriteProcessedLiveBytes.Load() + rewriteProcessedStaleBytes := db.vlogGenerationRewriteProcessedStaleBytes.Load() + rewriteProcessedTotal := 
rewriteProcessedLiveBytes + rewriteProcessedStaleBytes + rewriteBudgetConsumedTotal := db.vlogGenerationRewriteBudgetConsumed.Load() + rewriteChurnBps := db.vlogGenerationLastChurnBps.Load() + rewriteExecSeconds := 0.0 + if rewriteExecTotalNS > 0 { + rewriteExecSeconds = float64(rewriteExecTotalNS) / float64(time.Second) + } + rewriteBytesInPerSec := 0.0 + rewriteBytesOutPerSec := 0.0 + rewriteReclaimedBytesPerSec := 0.0 + rewriteBudgetConsumedPerSec := 0.0 + if rewriteExecSeconds > 0 { + rewriteBytesInPerSec = float64(rewriteBytesInTotal) / rewriteExecSeconds + rewriteBytesOutPerSec = float64(rewriteBytesOutTotal) / rewriteExecSeconds + rewriteReclaimedBytesPerSec = float64(rewriteReclaimedBytesTotal) / rewriteExecSeconds + rewriteBudgetConsumedPerSec = float64(rewriteBudgetConsumedTotal) / rewriteExecSeconds + } + rewriteOutputRatio := 0.0 + rewriteReclaimRatio := 0.0 + if rewriteBytesInTotal > 0 { + rewriteOutputRatio = float64(rewriteBytesOutTotal) / float64(rewriteBytesInTotal) + rewriteReclaimRatio = float64(rewriteReclaimedBytesTotal) / float64(rewriteBytesInTotal) + } + rewriteProcessedStaleRatio := 0.0 + if rewriteProcessedTotal > 0 { + rewriteProcessedStaleRatio = float64(rewriteProcessedStaleBytes) / float64(rewriteProcessedTotal) + } + rewriteBudgetConsumedSharePct := 0.0 + if db.valueLogRewriteBudgetBytes > 0 { + rewriteBudgetConsumedSharePct = (rewriteBudgetConsumedPerSec / float64(db.valueLogRewriteBudgetBytes)) * 100.0 + } + rewriteReclaimedVsChurnRatio := 0.0 + if rewriteChurnBps > 0 { + rewriteReclaimedVsChurnRatio = rewriteReclaimedBytesPerSec / float64(rewriteChurnBps) + } gcExecTotalNS := db.vlogGenerationGCExecTotalNanos.Load() gcExecMaxNS := db.vlogGenerationGCExecMaxNanos.Load() gcRuns := db.vlogGenerationGCRuns.Load() @@ -21038,7 +21078,7 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.maintenance.pass.avg_ms"] = "0.000" } stats["treedb.cache.vlog_generation.churn_bytes_total"] = fmt.Sprintf("%d", 
db.vlogGenerationChurnBytes.Load()) - stats["treedb.cache.vlog_generation.churn_bytes_per_sec"] = fmt.Sprintf("%d", db.vlogGenerationLastChurnBps.Load()) + stats["treedb.cache.vlog_generation.churn_bytes_per_sec"] = fmt.Sprintf("%d", rewriteChurnBps) stats["treedb.cache.vlog_generation.rewrite.queue_len"] = fmt.Sprintf("%d", rewriteQueueLen) stats["treedb.cache.vlog_generation.rewrite.queue_loaded"] = fmt.Sprintf("%t", rewriteQueueLoaded) stats["treedb.cache.vlog_generation.rewrite.ledger_segments"] = fmt.Sprintf("%d", rewriteLedgerSegments) @@ -21063,7 +21103,9 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite_budget.tokens_bytes"] = fmt.Sprintf("%d", rewriteBudgetTokens) stats["treedb.cache.vlog_generation.rewrite_budget.tokens_cap_bytes"] = fmt.Sprintf("%d", rewriteBudgetCap) stats["treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct"] = fmt.Sprintf("%.3f", rewriteBudgetUtilPct) - stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteBudgetConsumed.Load()) + stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total"] = fmt.Sprintf("%d", rewriteBudgetConsumedTotal) + stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_per_sec"] = fmt.Sprintf("%.3f", rewriteBudgetConsumedPerSec) + stats["treedb.cache.vlog_generation.rewrite_budget.consumed_share_of_budget_pct"] = fmt.Sprintf("%.3f", rewriteBudgetConsumedSharePct) stats["treedb.cache.vlog_generation.rewrite_trigger.stale_ratio_ppm"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerRatioPPM) stats["treedb.cache.vlog_generation.rewrite_trigger.total_bytes"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerBytes) stats["treedb.cache.vlog_generation.rewrite_trigger.churn_per_sec"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerChurn) @@ -21085,10 +21127,17 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.segments.hot"] = fmt.Sprintf("%d", 
retained.SegmentsHot) stats["treedb.cache.vlog_generation.segments.warm"] = fmt.Sprintf("%d", retained.SegmentsWarm) stats["treedb.cache.vlog_generation.segments.cold"] = fmt.Sprintf("%d", retained.SegmentsCold) - stats["treedb.cache.vlog_generation.rewrite.bytes_in"] = fmt.Sprintf("%d", db.vlogGenerationRewriteBytesIn.Load()) - stats["treedb.cache.vlog_generation.rewrite.bytes_out"] = fmt.Sprintf("%d", db.vlogGenerationRewriteBytesOut.Load()) - stats["treedb.cache.vlog_generation.rewrite.processed_live_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteProcessedLiveBytes.Load()) - stats["treedb.cache.vlog_generation.rewrite.processed_stale_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteProcessedStaleBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.bytes_in"] = fmt.Sprintf("%d", rewriteBytesInTotal) + stats["treedb.cache.vlog_generation.rewrite.bytes_out"] = fmt.Sprintf("%d", rewriteBytesOutTotal) + stats["treedb.cache.vlog_generation.rewrite.processed_live_bytes"] = fmt.Sprintf("%d", rewriteProcessedLiveBytes) + stats["treedb.cache.vlog_generation.rewrite.processed_stale_bytes"] = fmt.Sprintf("%d", rewriteProcessedStaleBytes) + stats["treedb.cache.vlog_generation.rewrite.reclaim_ratio"] = fmt.Sprintf("%.6f", rewriteReclaimRatio) + stats["treedb.cache.vlog_generation.rewrite.output_ratio"] = fmt.Sprintf("%.6f", rewriteOutputRatio) + stats["treedb.cache.vlog_generation.rewrite.processed_stale_ratio"] = fmt.Sprintf("%.6f", rewriteProcessedStaleRatio) + stats["treedb.cache.vlog_generation.rewrite.exec.bytes_in_per_sec"] = fmt.Sprintf("%.3f", rewriteBytesInPerSec) + stats["treedb.cache.vlog_generation.rewrite.exec.bytes_out_per_sec"] = fmt.Sprintf("%.3f", rewriteBytesOutPerSec) + stats["treedb.cache.vlog_generation.rewrite.exec.reclaimed_bytes_per_sec"] = fmt.Sprintf("%.3f", rewriteReclaimedBytesPerSec) + stats["treedb.cache.vlog_generation.rewrite.exec.reclaimed_vs_churn_ratio"] = fmt.Sprintf("%.6f", rewriteReclaimedVsChurnRatio) 
stats["treedb.cache.vlog_generation.rewrite.no_reclaim_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteNoReclaimRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.no_reclaim_stale_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteNoReclaimStaleBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteRuns.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 61f4818ff..dc469bf50 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -6466,6 +6466,9 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationRewriteRuns.Store(3) db.vlogGenerationRewriteExecTotalNanos.Store(uint64((150 * time.Millisecond).Nanoseconds())) db.vlogGenerationRewriteExecMaxNanos.Store(uint64((70 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewriteBytesIn.Store(1000) + db.vlogGenerationRewriteBytesOut.Store(600) + db.vlogGenerationRewriteReclaimedBytes.Store(400) db.vlogGenerationGCRuns.Store(2) db.vlogGenerationGCExecTotalNanos.Store(uint64((60 * time.Millisecond).Nanoseconds())) db.vlogGenerationGCExecMaxNanos.Store(uint64((35 * time.Millisecond).Nanoseconds())) @@ -6474,6 +6477,8 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationVacuumExecMaxNanos.Store(uint64((25 * time.Millisecond).Nanoseconds())) db.vlogGenerationRewriteBudgetTokensBytes.Store(512) db.vlogGenerationRewriteBudgetConsumed.Store(1536) + db.valueLogRewriteBudgetBytes = 2048 + db.vlogGenerationLastChurnBps.Store(2500) db.vlogGenerationRewriteAgeBlockedUntilNS.Store(time.Now().Add(5 * time.Second).UnixNano()) db.vlogGenerationLastGCSegmentsReferenced.Store(7) db.vlogGenerationLastGCBytesReferenced.Store(700) @@ -6770,6 +6775,12 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := 
stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total"]; got != "1536" { t.Fatalf("rewrite budget consumed=%q want 1536", got) } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_per_sec"]; got != "10240.000" { + t.Fatalf("rewrite budget consumed bytes/sec=%q want 10240.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.consumed_share_of_budget_pct"]; got != "500.000" { + t.Fatalf("rewrite budget consumed share pct=%q want 500.000", got) + } if got := stats["treedb.cache.vlog_generation.rewrite_budget.tokens_cap_bytes"]; got == "0" { t.Fatalf("rewrite budget cap bytes=%q want non-zero", got) } @@ -6812,6 +6823,27 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.rewrite.processed_stale_bytes"]; got != "450" { t.Fatalf("rewrite processed stale bytes=%q want 450", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.reclaim_ratio"]; got != "0.400000" { + t.Fatalf("rewrite reclaim ratio=%q want 0.400000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.output_ratio"]; got != "0.600000" { + t.Fatalf("rewrite output ratio=%q want 0.600000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.processed_stale_ratio"]; got != "0.333333" { + t.Fatalf("rewrite processed stale ratio=%q want 0.333333", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.bytes_in_per_sec"]; got != "6666.667" { + t.Fatalf("rewrite exec bytes in/sec=%q want 6666.667", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.bytes_out_per_sec"]; got != "4000.000" { + t.Fatalf("rewrite exec bytes out/sec=%q want 4000.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.reclaimed_bytes_per_sec"]; got != "2666.667" { + t.Fatalf("rewrite exec reclaimed bytes/sec=%q want 2666.667", got) + } + if got := 
stats["treedb.cache.vlog_generation.rewrite.exec.reclaimed_vs_churn_ratio"]; got != "1.066667" { + t.Fatalf("rewrite reclaimed vs churn ratio=%q want 1.066667", got) + } if got := stats["treedb.cache.vlog_generation.rewrite.no_reclaim_runs"]; got != "3" { t.Fatalf("rewrite no reclaim runs=%q want 3", got) } diff --git a/cmd/unified_bench/main.go b/cmd/unified_bench/main.go index d7737e6ed..461ae0235 100644 --- a/cmd/unified_bench/main.go +++ b/cmd/unified_bench/main.go @@ -1287,6 +1287,15 @@ func printTreeDBCacheStats(w io.Writer, inst *DBInstance, prefix string) { "treedb.cache.vlog_generation.segments.cold", "treedb.cache.vlog_generation.rewrite.bytes_in", "treedb.cache.vlog_generation.rewrite.bytes_out", + "treedb.cache.vlog_generation.rewrite.reclaim_ratio", + "treedb.cache.vlog_generation.rewrite.output_ratio", + "treedb.cache.vlog_generation.rewrite.processed_stale_ratio", + "treedb.cache.vlog_generation.rewrite.exec.bytes_in_per_sec", + "treedb.cache.vlog_generation.rewrite.exec.bytes_out_per_sec", + "treedb.cache.vlog_generation.rewrite.exec.reclaimed_bytes_per_sec", + "treedb.cache.vlog_generation.rewrite.exec.reclaimed_vs_churn_ratio", + "treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_per_sec", + "treedb.cache.vlog_generation.rewrite_budget.consumed_share_of_budget_pct", "treedb.cache.vlog_generation.rewrite.runs", "treedb.cache.vlog_generation.gc.deleted_segments", "treedb.cache.vlog_generation.gc.deleted_bytes", From f031d074d3f0ab46c1d4cf0331c1b3021974cf0a Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 19:17:55 -1000 Subject: [PATCH 47/61] treedb: add rewrite min-age lab knob and richer rewrite stats --- TreeDB/caching/db.go | 25 +++++-- TreeDB/db/db.go | 8 +++ TreeDB/public.go | 1 + cmd/unified_bench/README.md | 1 + cmd/unified_bench/adapter_treedb.go | 7 ++ cmd/unified_bench/adapter_treedb_vlog_test.go | 17 +++++ cmd/unified_bench/main.go | 52 ++++++++++++++ .../profiles_treedb_index_test.go | 4 ++ 
docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md | 7 ++ ...EWRITE_MIN_SEGMENT_AGE_SWEEP_2026-03-28.md | 72 +++++++++++++++++++ scripts/celestia_fast_gate.sh | 5 ++ 11 files changed, 194 insertions(+), 5 deletions(-) create mode 100644 docs/benchmarks/VLOG_REWRITE_MIN_SEGMENT_AGE_SWEEP_2026-03-28.md diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index ea4482b18..8fbca7141 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5343,6 +5343,11 @@ type Options struct { ValueLogRewriteTriggerTotalBytes int64 // ValueLogRewriteTriggerChurnPerSec triggers rewrite by churn rate. ValueLogRewriteTriggerChurnPerSec int64 + // ValueLogRewriteMinSegmentAge gates online rewrite to source segments that + // are at least this old. + // + // 0 uses the implementation default. + ValueLogRewriteMinSegmentAge time.Duration // ForceValueLogPointers stores all values out-of-line in the value log. ForceValueLogPointers bool // DisableReadChecksum skips CRC verification on value-log reads. @@ -5589,6 +5594,7 @@ type DB struct { valueLogRewriteTriggerRatioPPM uint32 valueLogRewriteTriggerBytes int64 valueLogRewriteTriggerChurn int64 + valueLogRewriteMinSegmentAge time.Duration valueLogReader *valuelog.Manager valueLogHotLanes []int valueLogWarmLanes []int @@ -7654,6 +7660,7 @@ func Open(dir string, backend BackendDB, opts Options) (*DB, error) { valueLogRewriteTriggerRatioPPM := opts.ValueLogRewriteTriggerStaleRatioPPM valueLogRewriteTriggerBytes := opts.ValueLogRewriteTriggerTotalBytes valueLogRewriteTriggerChurn := opts.ValueLogRewriteTriggerChurnPerSec + valueLogRewriteMinSegmentAge := opts.ValueLogRewriteMinSegmentAge if valueLogGenerationHotTarget < 0 { return nil, fmt.Errorf("cachingdb: invalid value-log generational hot segment target bytes %d", valueLogGenerationHotTarget) } @@ -7675,6 +7682,9 @@ func Open(dir string, backend BackendDB, opts Options) (*DB, error) { if valueLogRewriteTriggerChurn < 0 { return nil, fmt.Errorf("cachingdb: invalid value-log generational 
rewrite trigger churn/sec %d", valueLogRewriteTriggerChurn) } + if valueLogRewriteMinSegmentAge < 0 { + return nil, fmt.Errorf("cachingdb: invalid value-log generational rewrite min segment age %s", valueLogRewriteMinSegmentAge) + } if valueLogGenerationPolicyUint8 == uint8(backenddb.ValueLogGenerationHotWarmCold) { if valueLogGenerationHotTarget == 0 { valueLogGenerationHotTarget = defaultVlogGenerationHotTargetBytes @@ -7695,6 +7705,9 @@ func Open(dir string, backend BackendDB, opts Options) (*DB, error) { valueLogRewriteTriggerRatioPPM = defaultVlogRewriteTriggerStalePPM } } + if valueLogRewriteMinSegmentAge == 0 { + valueLogRewriteMinSegmentAge = vlogGenerationRewriteMinSegmentAge + } valueLogRawWritevMinAvgBytes := opts.ValueLogRawWritevMinAvgBytes if valueLogRawWritevMinAvgBytes < 0 { valueLogRawWritevMinAvgBytes = 0 @@ -7992,6 +8005,7 @@ func Open(dir string, backend BackendDB, opts Options) (*DB, error) { valueLogRewriteTriggerRatioPPM: valueLogRewriteTriggerRatioPPM, valueLogRewriteTriggerBytes: valueLogRewriteTriggerBytes, valueLogRewriteTriggerChurn: valueLogRewriteTriggerChurn, + valueLogRewriteMinSegmentAge: valueLogRewriteMinSegmentAge, memtableValueLogPointers: true, indexOuterLeavesInValueLog: opts.IndexOuterLeavesInValueLog, valueLogReader: valueLogReader, @@ -14322,7 +14336,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog MaxSourceBytes: maxSourceBytes, MinSegmentStaleRatio: minStaleRatio, MinSegmentStaleBytes: 1, - MinSegmentAge: vlogGenerationRewriteMinSegmentAge, + MinSegmentAge: db.valueLogRewriteMinSegmentAge, } planStart := time.Now() plan, err := planner.ValueLogRewritePlan(ctx, planOpts) @@ -14481,7 +14495,7 @@ planned: MaxSourceBytes: maxSourceBytes, MinSegmentStaleRatio: minStaleRatio, MinSegmentStaleBytes: vlogGenerationRewriteMinSegmentStaleBytes, - MinSegmentAge: vlogGenerationRewriteMinSegmentAge, + MinSegmentAge: db.valueLogRewriteMinSegmentAge, }) cancel() planDur := time.Since(planStart) @@ 
-14535,14 +14549,14 @@ planned: db.observeVlogGenerationRewritePlanPenaltyFilter(beforePenaltyFilter, len(plan.SourceFileIDs)) } if len(plan.SourceFileIDs) == 0 { - if shouldDeferVlogGenerationRewritePlanForAge(plan, vlogGenerationRewriteMinSegmentAge) { + if shouldDeferVlogGenerationRewritePlanForAge(plan, db.valueLogRewriteMinSegmentAge) { db.setVlogGenerationRewriteAgeBlockedUntil(now.Add(plan.AgeBlockedMinRemainingAge)) db.debugVlogMaintf( "rewrite_plan pre_rewrite age_blocked segments=%d stale_bytes=%d retry_after_ms=%d min_age_ms=%d", plan.AgeBlockedSegments, plan.AgeBlockedBytesStale, plan.AgeBlockedMinRemainingAge.Milliseconds(), - vlogGenerationRewriteMinSegmentAge.Milliseconds(), + db.valueLogRewriteMinSegmentAge.Milliseconds(), ) } else { db.clearVlogGenerationRewriteAgeBlockedUntil() @@ -14759,7 +14773,7 @@ planned: rewriteOpts.MaxSourceBytes = maxSourceBytes rewriteOpts.MinSegmentStaleRatio = db.vlogGenerationRewriteMinStaleRatioForGenericPass(totalBytes) rewriteOpts.MinSegmentStaleBytes = vlogGenerationRewriteMinSegmentStaleBytes - rewriteOpts.MinSegmentAge = vlogGenerationRewriteMinSegmentAge + rewriteOpts.MinSegmentAge = db.valueLogRewriteMinSegmentAge } var ctx context.Context var cancel context.CancelFunc @@ -21109,6 +21123,7 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite_trigger.stale_ratio_ppm"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerRatioPPM) stats["treedb.cache.vlog_generation.rewrite_trigger.total_bytes"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerBytes) stats["treedb.cache.vlog_generation.rewrite_trigger.churn_per_sec"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerChurn) + stats["treedb.cache.vlog_generation.rewrite.min_segment_age_ms"] = fmt.Sprintf("%d", db.valueLogRewriteMinSegmentAge.Milliseconds()) // PR1 scaffolding: legacy allocator still owns placement; report retained // totals under hot generation until generation-aware allocator lands. 
stats["treedb.cache.vlog_generation.bytes.live.total"] = fmt.Sprintf("%d", retained.BytesTotal) diff --git a/TreeDB/db/db.go b/TreeDB/db/db.go index ae4b583d3..d7b2f7c05 100644 --- a/TreeDB/db/db.go +++ b/TreeDB/db/db.go @@ -280,6 +280,11 @@ type ValueLogGenerationConfig struct { // RewriteTriggerChurnPerSec triggers rewrite when churn rate exceeds // threshold (0 disables). RewriteTriggerChurnPerSec int64 + // RewriteMinSegmentAge gates online rewrite to source segments that are at + // least this old. + // + // 0 uses the implementation default. + RewriteMinSegmentAge time.Duration } // ValueLogDomainThreshold overrides inline-vs-pointer placement policy for keys @@ -968,6 +973,9 @@ func validateOptions(opts Options) error { if opts.ValueLog.Generational.RewriteTriggerChurnPerSec < 0 { return fmt.Errorf("treedb: invalid value-log generational rewrite trigger churn/sec %d", opts.ValueLog.Generational.RewriteTriggerChurnPerSec) } + if opts.ValueLog.Generational.RewriteMinSegmentAge < 0 { + return fmt.Errorf("treedb: invalid value-log generational rewrite min segment age %s", opts.ValueLog.Generational.RewriteMinSegmentAge) + } seenDomains := make(map[string]struct{}, len(opts.ValueLog.DomainInlineThresholds)) for i := range opts.ValueLog.DomainInlineThresholds { d := opts.ValueLog.DomainInlineThresholds[i] diff --git a/TreeDB/public.go b/TreeDB/public.go index 2af25ada1..8436b482a 100644 --- a/TreeDB/public.go +++ b/TreeDB/public.go @@ -590,6 +590,7 @@ func Open(opts Options) (*DB, error) { ValueLogRewriteTriggerStaleRatioPPM: opts.ValueLog.Generational.RewriteTriggerStaleRatioPPM, ValueLogRewriteTriggerTotalBytes: opts.ValueLog.Generational.RewriteTriggerTotalBytes, ValueLogRewriteTriggerChurnPerSec: opts.ValueLog.Generational.RewriteTriggerChurnPerSec, + ValueLogRewriteMinSegmentAge: opts.ValueLog.Generational.RewriteMinSegmentAge, ForceValueLogPointers: opts.ValueLog.ForcePointers, ValueLogDictTrain: opts.ValueLog.DictTrain, ValueLogDictMaxK: 
opts.ValueLog.DictMaxK, diff --git a/cmd/unified_bench/README.md b/cmd/unified_bench/README.md index 93c26afdd..85ef9e258 100644 --- a/cmd/unified_bench/README.md +++ b/cmd/unified_bench/README.md @@ -95,6 +95,7 @@ GOWORK=off GOMEMLIMIT=4GiB GOMAXPROCS=2 go test -json -p 1 . \ - `-treedb-allow-unsafe` TreeDB: allow unsafe durability/integrity options (required for unsafe toggles) - `-treedb-vlog-dict` TreeDB: value-log dict compression mode (`default|on|off|both`) - `-treedb-vlog-auto-policy` TreeDB: value-log auto policy (`balanced|throughput|size`) +- `-treedb-vlog-rewrite-min-segment-age-ms` TreeDB: minimum source segment age for online generational rewrite (`0`=default) - `-treedb-vlog-dict-frame-encode-level` TreeDB: dict frame zstd encoder level (`engine|fastest|default|better|best|all|`) - `-treedb-vlog-dict-frame-entropy` TreeDB: dict frame entropy mode (`engine|on|off|both`) - `-seed` PRNG seed for randomized tests (default 1; `0` = time-based) diff --git a/cmd/unified_bench/adapter_treedb.go b/cmd/unified_bench/adapter_treedb.go index 016af5206..ae19a2359 100644 --- a/cmd/unified_bench/adapter_treedb.go +++ b/cmd/unified_bench/adapter_treedb.go @@ -70,6 +70,7 @@ var ( treedbVlogRewriteTriggerStaleRatioPPM = flag.Uint("treedb-vlog-rewrite-trigger-stale-ratio-ppm", 0, "TreeDB: generational rewrite stale/live trigger in ppm (0=disabled)") treedbVlogRewriteTriggerTotalBytes = flag.Int64("treedb-vlog-rewrite-trigger-total-bytes", 0, "TreeDB: generational rewrite total retained bytes trigger (0=disabled)") treedbVlogRewriteTriggerChurnPerSec = flag.Int64("treedb-vlog-rewrite-trigger-churn-per-sec", 0, "TreeDB: generational rewrite churn trigger in bytes/sec (0=disabled)") + treedbVlogRewriteMinSegmentAgeMS = flag.Int("treedb-vlog-rewrite-min-segment-age-ms", 0, "TreeDB: generational rewrite minimum source segment age in milliseconds (0=default)") treedbVlogBlockTargetBytes = flag.Int("treedb-vlog-block-target-bytes", 0, "TreeDB: value-log block target 
compressed bytes (0=default)") treedbVlogIncompressibleHoldBytes = flag.Int("treedb-vlog-incompressible-hold-bytes", 0, "TreeDB: auto-mode incompressible hold bytes (0=default)") treedbVlogIncompressibleProbeBytes = flag.Int("treedb-vlog-incompressible-probe-bytes", 0, "TreeDB: auto-mode incompressible probe interval bytes (0=default)") @@ -359,6 +360,11 @@ func (r treeDBOptionsReport) formatText(indent string) string { lines = append(lines, fmt.Sprintf("vlog.rewrite_trigger_stale_ratio_ppm=%d", r.opts.ValueLog.Generational.RewriteTriggerStaleRatioPPM)) lines = append(lines, fmt.Sprintf("vlog.rewrite_trigger_total_bytes=%d", r.opts.ValueLog.Generational.RewriteTriggerTotalBytes)) lines = append(lines, fmt.Sprintf("vlog.rewrite_trigger_churn_per_sec=%d", r.opts.ValueLog.Generational.RewriteTriggerChurnPerSec)) + if minAge := r.opts.ValueLog.Generational.RewriteMinSegmentAge; minAge <= 0 { + lines = append(lines, fmt.Sprintf("vlog.rewrite_min_segment_age_ms=default (effective=%d)", int((30*time.Second)/time.Millisecond))) + } else { + lines = append(lines, fmt.Sprintf("vlog.rewrite_min_segment_age_ms=%d", int(minAge/time.Millisecond))) + } if target := r.opts.ValueLog.BlockTargetCompressedBytes; target <= 0 { lines = append(lines, "vlog.block_target_bytes=default (effective=4096B)") } else { @@ -663,6 +669,7 @@ func buildTreeDBOptions(dir string) (treedb.Options, treeDBOptionsReport, error) opts.ValueLog.Generational.RewriteTriggerStaleRatioPPM = clampUint32(uint64(*treedbVlogRewriteTriggerStaleRatioPPM)) opts.ValueLog.Generational.RewriteTriggerTotalBytes = *treedbVlogRewriteTriggerTotalBytes opts.ValueLog.Generational.RewriteTriggerChurnPerSec = *treedbVlogRewriteTriggerChurnPerSec + opts.ValueLog.Generational.RewriteMinSegmentAge = time.Duration(*treedbVlogRewriteMinSegmentAgeMS) * time.Millisecond if maintenanceMode == "bench" { // Disable background maintenance loops. 
"bench" mode aims for stable diff --git a/cmd/unified_bench/adapter_treedb_vlog_test.go b/cmd/unified_bench/adapter_treedb_vlog_test.go index 3a2dbd9f5..f54948848 100644 --- a/cmd/unified_bench/adapter_treedb_vlog_test.go +++ b/cmd/unified_bench/adapter_treedb_vlog_test.go @@ -131,6 +131,23 @@ func TestBuildTreeDBOptions_VlogDictClassModeFlag(t *testing.T) { } } +func TestBuildTreeDBOptions_VlogRewriteMinSegmentAgeFlag(t *testing.T) { + saved := saveTreeDBFlagState() + defer restoreTreeDBFlagState(saved) + + *treedbVlogRewriteMinSegmentAgeMS = 5000 + opts, rep, err := buildTreeDBOptions("") + if err != nil { + t.Fatalf("buildTreeDBOptions: %v", err) + } + if got := opts.ValueLog.Generational.RewriteMinSegmentAge.Milliseconds(); got != 5000 { + t.Fatalf("unexpected rewrite min segment age ms: got=%d want=5000", got) + } + if got := rep.formatText(""); !strings.Contains(got, "vlog.rewrite_min_segment_age_ms=5000") { + t.Fatalf("resolved options missing rewrite min segment age: %q", got) + } +} + func TestBuildTreeDBOptions_InvalidVlogDictClassMode(t *testing.T) { saved := saveTreeDBFlagState() defer restoreTreeDBFlagState(saved) diff --git a/cmd/unified_bench/main.go b/cmd/unified_bench/main.go index 461ae0235..e4d3aa592 100644 --- a/cmd/unified_bench/main.go +++ b/cmd/unified_bench/main.go @@ -1270,8 +1270,26 @@ func printTreeDBCacheStats(w io.Writer, inst *DBInstance, prefix string) { "treedb.cache.vlog_generation.policy", "treedb.cache.vlog_generation.scheduler_state", "treedb.cache.vlog_generation.scheduler_last_reason", + "treedb.cache.vlog_generation.maintenance_phase", + "treedb.cache.vlog_generation.maintenance.attempts", + "treedb.cache.vlog_generation.maintenance.acquired", + "treedb.cache.vlog_generation.maintenance.collisions", + "treedb.cache.vlog_generation.maintenance.skip.wal_on_periodic", + "treedb.cache.vlog_generation.maintenance.skip.maintenance_phase", + "treedb.cache.vlog_generation.maintenance.skip.stage_gate", + 
"treedb.cache.vlog_generation.maintenance.skip.stage_gate_not_due", + "treedb.cache.vlog_generation.maintenance.skip.stage_gate_due_reserved", + "treedb.cache.vlog_generation.maintenance.skip.age_blocked_gate", + "treedb.cache.vlog_generation.maintenance.skip.priority_pending", + "treedb.cache.vlog_generation.maintenance.skip.quiet_window", + "treedb.cache.vlog_generation.maintenance.skip.before_first_checkpoint", + "treedb.cache.vlog_generation.maintenance.skip.checkpoint_inflight", "treedb.cache.vlog_generation.churn_bytes_total", "treedb.cache.vlog_generation.churn_bytes_per_sec", + "treedb.cache.vlog_generation.rewrite_trigger.stale_ratio_ppm", + "treedb.cache.vlog_generation.rewrite_trigger.total_bytes", + "treedb.cache.vlog_generation.rewrite_trigger.churn_per_sec", + "treedb.cache.vlog_generation.rewrite.min_segment_age_ms", "treedb.cache.vlog_generation.bytes.live.total", "treedb.cache.vlog_generation.bytes.live.hot", "treedb.cache.vlog_generation.bytes.live.warm", @@ -1285,6 +1303,29 @@ func printTreeDBCacheStats(w io.Writer, inst *DBInstance, prefix string) { "treedb.cache.vlog_generation.segments.hot", "treedb.cache.vlog_generation.segments.warm", "treedb.cache.vlog_generation.segments.cold", + "treedb.cache.vlog_generation.rewrite.queue_len", + "treedb.cache.vlog_generation.rewrite.queue_loaded", + "treedb.cache.vlog_generation.rewrite.ledger_segments", + "treedb.cache.vlog_generation.rewrite.ledger_bytes_total", + "treedb.cache.vlog_generation.rewrite.ledger_bytes_live", + "treedb.cache.vlog_generation.rewrite.ledger_bytes_stale", + "treedb.cache.vlog_generation.rewrite.ledger_stale_ratio_ppm", + "treedb.cache.vlog_generation.rewrite.stage_pending", + "treedb.cache.vlog_generation.rewrite.stage_observed_unix_nano", + "treedb.cache.vlog_generation.rewrite.penalties_active", + "treedb.cache.vlog_generation.rewrite.age_blocked_until_unix_nano", + "treedb.cache.vlog_generation.rewrite.age_blocked_remaining_ms", + 
"treedb.cache.vlog_generation.rewrite.plan_runs", + "treedb.cache.vlog_generation.rewrite.plan_canceled", + "treedb.cache.vlog_generation.rewrite.plan_errors", + "treedb.cache.vlog_generation.rewrite.plan_empty", + "treedb.cache.vlog_generation.rewrite.plan_empty.age_blocked", + "treedb.cache.vlog_generation.rewrite.plan_empty.no_selection", + "treedb.cache.vlog_generation.rewrite.plan_selected", + "treedb.cache.vlog_generation.rewrite.plan_selected_segments_total", + "treedb.cache.vlog_generation.rewrite.plan_selected_bytes_total", + "treedb.cache.vlog_generation.rewrite.plan_selected_bytes_live", + "treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale", "treedb.cache.vlog_generation.rewrite.bytes_in", "treedb.cache.vlog_generation.rewrite.bytes_out", "treedb.cache.vlog_generation.rewrite.reclaim_ratio", @@ -1294,8 +1335,19 @@ func printTreeDBCacheStats(w io.Writer, inst *DBInstance, prefix string) { "treedb.cache.vlog_generation.rewrite.exec.bytes_out_per_sec", "treedb.cache.vlog_generation.rewrite.exec.reclaimed_bytes_per_sec", "treedb.cache.vlog_generation.rewrite.exec.reclaimed_vs_churn_ratio", + "treedb.cache.vlog_generation.rewrite.no_reclaim_runs", + "treedb.cache.vlog_generation.rewrite.no_reclaim_stale_bytes", + "treedb.cache.vlog_generation.rewrite.canceled_runs", + "treedb.cache.vlog_generation.rewrite.deadline_runs", + "treedb.cache.vlog_generation.rewrite.ineffective_runs", "treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_per_sec", "treedb.cache.vlog_generation.rewrite_budget.consumed_share_of_budget_pct", + "treedb.cache.vlog_generation.rewrite_budget.bytes_per_sec", + "treedb.cache.vlog_generation.rewrite_budget.records_per_sec", + "treedb.cache.vlog_generation.rewrite_budget.tokens_bytes", + "treedb.cache.vlog_generation.rewrite_budget.tokens_cap_bytes", + "treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct", + "treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total", 
"treedb.cache.vlog_generation.rewrite.runs", "treedb.cache.vlog_generation.gc.deleted_segments", "treedb.cache.vlog_generation.gc.deleted_bytes", diff --git a/cmd/unified_bench/profiles_treedb_index_test.go b/cmd/unified_bench/profiles_treedb_index_test.go index 80720f73c..36a897562 100644 --- a/cmd/unified_bench/profiles_treedb_index_test.go +++ b/cmd/unified_bench/profiles_treedb_index_test.go @@ -165,6 +165,7 @@ type savedTreeDBFlagState struct { vlogGenColdBytes int64 vlogRewriteBudgetBPS int64 vlogRewriteBudgetRPS int + vlogRewriteMinAgeMS int disableWAL bool relaxedSync bool disableChecksum bool @@ -197,6 +198,7 @@ func saveTreeDBFlagState() savedTreeDBFlagState { vlogGenColdBytes: *treedbVlogGenerationColdSegmentBytes, vlogRewriteBudgetBPS: *treedbVlogRewriteBudgetBytesPerSec, vlogRewriteBudgetRPS: *treedbVlogRewriteBudgetRecordsPerSec, + vlogRewriteMinAgeMS: *treedbVlogRewriteMinSegmentAgeMS, disableWAL: *treedbDisableWAL, relaxedSync: *treedbRelaxedSync, disableChecksum: *treedbDisableReadChecksum, @@ -225,6 +227,7 @@ func restoreTreeDBFlagState(s savedTreeDBFlagState) { *treedbVlogGenerationColdSegmentBytes = s.vlogGenColdBytes *treedbVlogRewriteBudgetBytesPerSec = s.vlogRewriteBudgetBPS *treedbVlogRewriteBudgetRecordsPerSec = s.vlogRewriteBudgetRPS + *treedbVlogRewriteMinSegmentAgeMS = s.vlogRewriteMinAgeMS *treedbDisableWAL = s.disableWAL *treedbRelaxedSync = s.relaxedSync *treedbDisableReadChecksum = s.disableChecksum @@ -252,6 +255,7 @@ func resetTreeDBIndexFlagsForTest() { *treedbVlogGenerationColdSegmentBytes = 0 *treedbVlogRewriteBudgetBytesPerSec = 0 *treedbVlogRewriteBudgetRecordsPerSec = 0 + *treedbVlogRewriteMinSegmentAgeMS = 0 *treedbDisableWAL = false *treedbRelaxedSync = false *treedbDisableReadChecksum = false diff --git a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md index 249a2b753..648120a26 100644 --- a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md +++ b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md @@ -10,6 
+10,7 @@ - `-treedb-vlog-generation-policy hot_warm_cold` - `-treedb-vlog-rewrite-trigger-total-bytes` set for your dataset size - `-treedb-vlog-rewrite-budget-bytes-per-sec` and/or `-treedb-vlog-rewrite-budget-records-per-sec` +- `-treedb-vlog-rewrite-min-segment-age-ms` keep default for production; lower only for short-loop experiments ## Maintenance Model - Rewrite: threshold-triggered and budget-bounded. @@ -23,6 +24,12 @@ Primary keys: - `treedb.cache.vlog_generation.scheduler_state` - `treedb.cache.vlog_generation.scheduler_last_reason` - `treedb.cache.vlog_generation.churn_bytes_per_sec` +- `treedb.cache.vlog_generation.rewrite.min_segment_age_ms` +- `treedb.cache.vlog_generation.rewrite.plan_runs` +- `treedb.cache.vlog_generation.rewrite.plan_empty` +- `treedb.cache.vlog_generation.rewrite.plan_empty.age_blocked` +- `treedb.cache.vlog_generation.rewrite.plan_selected` +- `treedb.cache.vlog_generation.rewrite.ledger_bytes_stale` - `treedb.cache.vlog_generation.rewrite.runs` - `treedb.cache.vlog_generation.rewrite.bytes_in` - `treedb.cache.vlog_generation.rewrite.bytes_out` diff --git a/docs/benchmarks/VLOG_REWRITE_MIN_SEGMENT_AGE_SWEEP_2026-03-28.md b/docs/benchmarks/VLOG_REWRITE_MIN_SEGMENT_AGE_SWEEP_2026-03-28.md new file mode 100644 index 000000000..6f3608eb7 --- /dev/null +++ b/docs/benchmarks/VLOG_REWRITE_MIN_SEGMENT_AGE_SWEEP_2026-03-28.md @@ -0,0 +1,72 @@ +# VLOG Rewrite Min-Segment-Age Sweep (2026-03-28) + +## Goal + +Evaluate whether lowering online rewrite min-segment-age improves short-loop +signal without harming sync-time or end-of-run app-dir size. 
+ +## Workload + +- Command core: + - `./bin/unified-bench` + - `-profile fast` + - `-dbs treedb` + - `-keys 900000` + - `-valsize 256` + - `-batchsize 4000` + - `-test batch_write_steady,random_write` + - `-val-pattern celestia_height_prefix_fill` + - `-checkpoint-every-bytes 4194304` + - `-treedb-force-value-pointers=true` + - `-treedb-vlog-compression dict` + - `-treedb-vlog-compression-autotune aggressive` + - `-treedb-vlog-generation-policy hot_warm_cold` + - `-treedb-vlog-rewrite-trigger-total-bytes 1` + - `-treedb-vlog-rewrite-trigger-stale-ratio-ppm 1` + - `-treedb-vlog-rewrite-trigger-churn-per-sec 1` + - `-treedb-vlog-rewrite-budget-bytes-per-sec 134217728` + - `-treedb-cache-stats-after-tests=true` + +- Swept: + - default (effective 30000ms) + - `-treedb-vlog-rewrite-min-segment-age-ms 1000` + - `-treedb-vlog-rewrite-min-segment-age-ms 5000` + - `-treedb-vlog-rewrite-min-segment-age-ms 10000` + +## Results + +| min age | rewrite activity | dir bytes | wal bytes | note | +|---|---:|---:|---:|---| +| default (30000ms) | rewrite_runs=0, plan_empty.age_blocked=1 | 567,439,668 | 553,889,306 | baseline behavior | +| 1000ms | rewrite_runs=1, plan_selected=1, gc_runs=1 | 702,734,421 | 685,611,243 | clear regression | +| 5000ms | rewrite_runs=0, plan_empty.age_blocked=1 | 567,406,884 | 553,889,290 | effectively baseline | +| 10000ms | rewrite_runs=0, plan_empty.age_blocked=1 | 567,439,650 | 553,889,288 | effectively baseline | + +Observed for the regressing 1000ms run: + +- `rewrite.bytes_in` ~= 64MB +- `rewrite.bytes_out` ~= 528MB +- `rewrite.reclaim_ratio` = `0.000000` +- `gc.deleted_segments` = `0` + +Interpretation: rewrite executes too early and amplifies bytes without reclaim, +so this setting is not suitable for production-like loops. 
+ +## Interleaved A/B confirmation + +Using `scripts/celestia_fast_gate.sh` with same binaries and only this flag as +candidate delta (`CANDIDATE_EXTRA_FLAGS='-treedb-vlog-rewrite-min-segment-age-ms 1'`): + +- Output: `/tmp/gomap_minage_gate_ctr4Ji/gate` +- Decision: `clear_regression` +- Completed pairs: 2 +- Median delta (`candidate - control`): + - `s_sync_app_bytes`: +135,580,501.5 + - `t_sync_seconds`: +13 + +## Conclusion + +- Keep default min-segment-age for normal runs. +- Keep the flag as an explicit lab-only override for controlled scheduler + experiments. +- Do not enable low values (1ms/1000ms) in gate/default configs. diff --git a/scripts/celestia_fast_gate.sh b/scripts/celestia_fast_gate.sh index f79292e0f..59d93e551 100755 --- a/scripts/celestia_fast_gate.sh +++ b/scripts/celestia_fast_gate.sh @@ -40,6 +40,7 @@ VLOG_COMPRESSION_AUTOTUNE="${VLOG_COMPRESSION_AUTOTUNE:-aggressive}" VLOG_COMPRESSION_VARIANT="${VLOG_COMPRESSION_VARIANT:-dict}" DICT_TRAIN_BYTES="${DICT_TRAIN_BYTES:-1048576}" DICT_BYTES="${DICT_BYTES:-32768}" +VLOG_REWRITE_MIN_SEGMENT_AGE_MS="${VLOG_REWRITE_MIN_SEGMENT_AGE_MS:-}" REWRITE_ENABLED="${REWRITE_ENABLED:-1}" REWRITE_ARGS="${REWRITE_ARGS:--rw}" @@ -255,6 +256,9 @@ run_variant() { -treedb-vlog-dict-train-bytes "$DICT_TRAIN_BYTES" -treedb-vlog-dict-dict-bytes "$DICT_BYTES" ) + if [[ -n "$VLOG_REWRITE_MIN_SEGMENT_AGE_MS" ]]; then + cmd+=(-treedb-vlog-rewrite-min-segment-age-ms "$VLOG_REWRITE_MIN_SEGMENT_AGE_MS") + fi if [[ -n "$COMMON_EXTRA_FLAGS" ]]; then # shellcheck disable=SC2206 @@ -737,6 +741,7 @@ vlog_compression_autotune=$VLOG_COMPRESSION_AUTOTUNE vlog_compression_variant=$VLOG_COMPRESSION_VARIANT dict_train_bytes=$DICT_TRAIN_BYTES dict_bytes=$DICT_BYTES +vlog_rewrite_min_segment_age_ms=$VLOG_REWRITE_MIN_SEGMENT_AGE_MS rewrite_enabled=$REWRITE_ENABLED rewrite_args=$REWRITE_ARGS measure_gzip=$MEASURE_GZIP From 4a52d51fff5049c8af0238172ccd662dfbe9613e Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 19:35:02 -1000 
Subject: [PATCH 48/61] treedb: reuse decode scratch in online rewrite reads --- TreeDB/db/vlog_rewrite.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/TreeDB/db/vlog_rewrite.go b/TreeDB/db/vlog_rewrite.go index ab5b5eb87..be3c3d069 100644 --- a/TreeDB/db/vlog_rewrite.go +++ b/TreeDB/db/vlog_rewrite.go @@ -31,6 +31,7 @@ const defaultValueLogRewriteSegmentBytes = 128 << 20 const rewriteDictMinPayloadBytes = 32 << 10 const rewriteDictBatchMaxK = 64 +const rewriteReadScratchMaxCap = 1 << 20 // 1MiB cap to avoid retaining oversized decode buffers func rewriteAllowDictForSmallPayload(value []byte) bool { if len(value) < page.PageSize { @@ -1230,6 +1231,7 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl swaps := make([]rewriteSwap, 0, batchSize) localityPolicy := normalizeValueLogRewriteLocalityPolicy(opts.LocalityPolicy) candidates := make([]rewriteCandidate, 0, batchSize) + var rewriteReadScratch []byte var canceledErr error flushBatch := func() error { @@ -1243,7 +1245,7 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl return err } for _, candidate := range candidates { - val, err := db.valueLogManager.Read(candidate.oldPtr) + val, usedScratch, err := db.valueLogManager.ReadUnsafeTo(candidate.oldPtr, rewriteReadScratch) if err != nil { return err } @@ -1251,6 +1253,15 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl if err != nil { return err } + if usedScratch { + // Reuse decode storage across records to reduce alloc churn while + // bounding retained capacity to avoid RSS blow-ups on outliers. 
+ if cap(val) > rewriteReadScratchMaxCap { + rewriteReadScratch = nil + } else { + rewriteReadScratch = val[:0] + } + } startRID++ stats.RecordsCopied++ swaps = append(swaps, rewriteSwap{ From f764ab93cee5e357e712ab299d5b50ccf3bd43bf Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 19:41:52 -1000 Subject: [PATCH 49/61] treedb: expose rewrite value vs leaf copy stats --- TreeDB/caching/db.go | 24 ++++++++++++++++++ TreeDB/db/vlog_rewrite.go | 52 +++++++++++++++++++++++++-------------- TreeDB/vlog_rewrite.go | 4 +++ cmd/unified_bench/main.go | 4 +++ 4 files changed, 65 insertions(+), 19 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 8fbca7141..ec960e5bd 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5856,6 +5856,10 @@ type DB struct { vlogGenerationRewriteBytesIn atomic.Uint64 vlogGenerationRewriteBytesOut atomic.Uint64 vlogGenerationRewriteReclaimedBytes atomic.Uint64 + vlogGenerationRewriteValueRecordsCopied atomic.Uint64 + vlogGenerationRewriteValueBytesCopied atomic.Uint64 + vlogGenerationRewriteLeafRefRecordsCopied atomic.Uint64 + vlogGenerationRewriteLeafRefBytesCopied atomic.Uint64 vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 vlogGenerationRewriteNoReclaimRuns atomic.Uint64 @@ -15117,6 +15121,18 @@ planned: if stats.RecordsCopied > 0 { db.vlogGenerationRemapSuccesses.Add(uint64(stats.RecordsCopied)) } + if stats.ValueRecordsCopied > 0 { + db.vlogGenerationRewriteValueRecordsCopied.Add(uint64(stats.ValueRecordsCopied)) + } + if stats.ValueBytesCopied > 0 { + db.vlogGenerationRewriteValueBytesCopied.Add(uint64(stats.ValueBytesCopied)) + } + if stats.LeafRefRecordsCopied > 0 { + db.vlogGenerationRewriteLeafRefRecordsCopied.Add(uint64(stats.LeafRefRecordsCopied)) + } + if stats.LeafRefBytesCopied > 0 { + db.vlogGenerationRewriteLeafRefBytesCopied.Add(uint64(stats.LeafRefBytesCopied)) + } if consumed > 0 { 
db.vlogGenerationConsumeRewriteBudgetBytes(consumed) } @@ -20954,6 +20970,10 @@ func (db *DB) Stats() map[string]string { rewriteBytesInTotal := db.vlogGenerationRewriteBytesIn.Load() rewriteBytesOutTotal := db.vlogGenerationRewriteBytesOut.Load() rewriteReclaimedBytesTotal := db.vlogGenerationRewriteReclaimedBytes.Load() + rewriteValueRecordsCopiedTotal := db.vlogGenerationRewriteValueRecordsCopied.Load() + rewriteValueBytesCopiedTotal := db.vlogGenerationRewriteValueBytesCopied.Load() + rewriteLeafRefRecordsCopiedTotal := db.vlogGenerationRewriteLeafRefRecordsCopied.Load() + rewriteLeafRefBytesCopiedTotal := db.vlogGenerationRewriteLeafRefBytesCopied.Load() rewriteProcessedLiveBytes := db.vlogGenerationRewriteProcessedLiveBytes.Load() rewriteProcessedStaleBytes := db.vlogGenerationRewriteProcessedStaleBytes.Load() rewriteProcessedTotal := rewriteProcessedLiveBytes + rewriteProcessedStaleBytes @@ -21144,6 +21164,10 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.segments.cold"] = fmt.Sprintf("%d", retained.SegmentsCold) stats["treedb.cache.vlog_generation.rewrite.bytes_in"] = fmt.Sprintf("%d", rewriteBytesInTotal) stats["treedb.cache.vlog_generation.rewrite.bytes_out"] = fmt.Sprintf("%d", rewriteBytesOutTotal) + stats["treedb.cache.vlog_generation.rewrite.value_records_copied"] = fmt.Sprintf("%d", rewriteValueRecordsCopiedTotal) + stats["treedb.cache.vlog_generation.rewrite.value_bytes_copied"] = fmt.Sprintf("%d", rewriteValueBytesCopiedTotal) + stats["treedb.cache.vlog_generation.rewrite.leafref_records_copied"] = fmt.Sprintf("%d", rewriteLeafRefRecordsCopiedTotal) + stats["treedb.cache.vlog_generation.rewrite.leafref_bytes_copied"] = fmt.Sprintf("%d", rewriteLeafRefBytesCopiedTotal) stats["treedb.cache.vlog_generation.rewrite.processed_live_bytes"] = fmt.Sprintf("%d", rewriteProcessedLiveBytes) stats["treedb.cache.vlog_generation.rewrite.processed_stale_bytes"] = fmt.Sprintf("%d", rewriteProcessedStaleBytes) 
stats["treedb.cache.vlog_generation.rewrite.reclaim_ratio"] = fmt.Sprintf("%.6f", rewriteReclaimRatio) diff --git a/TreeDB/db/vlog_rewrite.go b/TreeDB/db/vlog_rewrite.go index be3c3d069..133d1de4b 100644 --- a/TreeDB/db/vlog_rewrite.go +++ b/TreeDB/db/vlog_rewrite.go @@ -50,6 +50,14 @@ type ValueLogRewriteStats struct { BytesBefore int64 BytesAfter int64 RecordsCopied int + // Value* counters track key/value-pointer payload copied by the main rewrite + // pointer swap path. + ValueRecordsCopied int + ValueBytesCopied int64 + // LeafRef* counters track outer-leaf page payload copied by the leaf-ref + // rewrite path (indexOuterLeavesInValueLog mode). + LeafRefRecordsCopied int + LeafRefBytesCopied int64 // SourceSegmentsRequested is the number of source segments selected for this // rewrite run after applying selection filters. SourceSegmentsRequested int @@ -1264,6 +1272,8 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl } startRID++ stats.RecordsCopied++ + stats.ValueRecordsCopied++ + stats.ValueBytesCopied += int64(len(val)) swaps = append(swaps, rewriteSwap{ key: candidate.key, oldPtr: candidate.oldPtr, @@ -1334,11 +1344,13 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl // referenced leaf pages out of the selected source segments so cleanup can // actually reclaim space. 
if restrictSource && db.indexOuterLeavesInValueLog && len(sourceIDs) > 0 { - copied, err := db.rewriteLeafRefsOnline(ctx, writer, ridAlloc, sourceIDs, opts.SyncEachBatch) + copied, copiedBytes, err := db.rewriteLeafRefsOnline(ctx, writer, ridAlloc, sourceIDs, opts.SyncEachBatch) if err != nil { return stats, err } stats.RecordsCopied += copied + stats.LeafRefRecordsCopied += copied + stats.LeafRefBytesCopied += copiedBytes } } else { // Stop publishing further swaps after cancellation; cleanup below still @@ -1484,8 +1496,9 @@ type leafRefRewriteCtx struct { leafMap map[uint64]uint64 // old leafref id -> new leafref id internalMap map[uint64]uint64 // old internal page id -> new page id - retired []uint64 - copied int + retired []uint64 + copied int + copiedBytes int64 } func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { @@ -1542,6 +1555,7 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { } c.leafMap[id] = leafID c.copied++ + c.copiedBytes += int64(len(leafPage)) return leafID, true, nil } @@ -1639,32 +1653,32 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { } } -func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, ridAlloc *rewriteRIDAllocator, sourceIDs map[uint32]struct{}, sync bool) (copied int, err error) { +func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, ridAlloc *rewriteRIDAllocator, sourceIDs map[uint32]struct{}, sync bool) (copied int, copiedBytes int64, err error) { if db == nil { - return 0, fmt.Errorf("missing db") + return 0, 0, fmt.Errorf("missing db") } if !db.indexOuterLeavesInValueLog { - return 0, nil + return 0, 0, nil } if db.readOnly { - return 0, ErrReadOnly + return 0, 0, ErrReadOnly } if db.valueLogManager == nil { - return 0, fmt.Errorf("value log manager unavailable") + return 0, 0, fmt.Errorf("value log manager unavailable") } if writer == nil || ridAlloc == nil { - return 0, fmt.Errorf("vlog-rewrite: 
missing writer/rid state") + return 0, 0, fmt.Errorf("vlog-rewrite: missing writer/rid state") } // Treat nil sourceIDs as "all sources" and an empty, non-nil map as "no // sources". The latter means there is nothing to rewrite. if sourceIDs != nil && len(sourceIDs) == 0 { - return 0, nil + return 0, 0, nil } if ctx == nil { ctx = context.Background() } if err := ctx.Err(); err != nil { - return 0, err + return 0, 0, err } db.writeMu.Lock() @@ -1673,7 +1687,7 @@ func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, snap := db.AcquireSnapshot() if snap == nil || snap.idx == nil || snap.state == nil { closeRewriteSnapshot(&err, snap) - return 0, fmt.Errorf("missing snapshot state") + return 0, 0, fmt.Errorf("missing snapshot state") } defer closeRewriteSnapshot(&err, snap) @@ -1710,33 +1724,33 @@ func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, newSysRoot, sysChanged, err := leafCtx.rewriteNode(sysRoot) if err != nil { - return 0, err + return 0, 0, err } newRoot, userChanged, err := leafCtx.rewriteNode(rootID) if err != nil { - return 0, err + return 0, 0, err } if !sysChanged && !userChanged { - return 0, nil + return 0, 0, nil } // Ensure the copied leaf-page records are visible before publishing new leaf // refs that point at them. 
if sync { if err := writer.Sync(); err != nil { - return 0, err + return 0, 0, err } } else { if err := writer.Flush(); err != nil { - return 0, err + return 0, 0, err } } if err := db.finalizeCommit(newRoot, newSysRoot, leafCtx.retired, sync, adaptive.Metrics{}, nil, db.indexOuterLeavesInValueLog, nil); err != nil { - return 0, err + return 0, 0, err } tracker = nil - return leafCtx.copied, nil + return leafCtx.copied, leafCtx.copiedBytes, nil } func nextRewriteRIDStart(segments []logSegment) (uint64, error) { diff --git a/TreeDB/vlog_rewrite.go b/TreeDB/vlog_rewrite.go index 5e60b37da..e685aec54 100644 --- a/TreeDB/vlog_rewrite.go +++ b/TreeDB/vlog_rewrite.go @@ -13,6 +13,10 @@ type ValueLogRewriteStats struct { BytesBefore int64 BytesAfter int64 RecordsCopied int + ValueRecordsCopied int + ValueBytesCopied int64 + LeafRefRecordsCopied int + LeafRefBytesCopied int64 SourceSegmentsRequested int SourceSegmentsStillReferenced int SourceSegmentsUnreferenced int diff --git a/cmd/unified_bench/main.go b/cmd/unified_bench/main.go index e4d3aa592..aebc1ff9d 100644 --- a/cmd/unified_bench/main.go +++ b/cmd/unified_bench/main.go @@ -1328,6 +1328,10 @@ func printTreeDBCacheStats(w io.Writer, inst *DBInstance, prefix string) { "treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale", "treedb.cache.vlog_generation.rewrite.bytes_in", "treedb.cache.vlog_generation.rewrite.bytes_out", + "treedb.cache.vlog_generation.rewrite.value_records_copied", + "treedb.cache.vlog_generation.rewrite.value_bytes_copied", + "treedb.cache.vlog_generation.rewrite.leafref_records_copied", + "treedb.cache.vlog_generation.rewrite.leafref_bytes_copied", "treedb.cache.vlog_generation.rewrite.reclaim_ratio", "treedb.cache.vlog_generation.rewrite.output_ratio", "treedb.cache.vlog_generation.rewrite.processed_stale_ratio", From f688e39209880fd833cfa9ed1a1b4e37f0498803 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 20:04:18 -1000 Subject: [PATCH 50/61] unified-bench: print 
retained-prune stats --- cmd/unified_bench/main.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmd/unified_bench/main.go b/cmd/unified_bench/main.go index aebc1ff9d..76053da17 100644 --- a/cmd/unified_bench/main.go +++ b/cmd/unified_bench/main.go @@ -1356,6 +1356,14 @@ func printTreeDBCacheStats(w io.Writer, inst *DBInstance, prefix string) { "treedb.cache.vlog_generation.gc.deleted_segments", "treedb.cache.vlog_generation.gc.deleted_bytes", "treedb.cache.vlog_generation.gc.runs", + "treedb.cache.vlog_retained_prune.runs", + "treedb.cache.vlog_retained_prune.forced_runs", + "treedb.cache.vlog_retained_prune.removed_segments", + "treedb.cache.vlog_retained_prune.removed_bytes", + "treedb.cache.vlog_retained_prune.live_skipped_segments", + "treedb.cache.vlog_retained_prune.live_skipped_bytes", + "treedb.cache.vlog_retained_prune.zombie_marked_segments", + "treedb.cache.vlog_retained_prune.zombie_marked_bytes", "treedb.cache.vlog_generation.vacuum.runs", "treedb.cache.vlog_generation.vacuum.failures", "treedb.cache.vlog_generation.remap.successes", From 186801c643916a9b23cb90c7cf798c1dd3d0e87b Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 20:19:53 -1000 Subject: [PATCH 51/61] valuelog: cache grouped compressed fallback reads --- TreeDB/internal/valuelog/manager.go | 181 ++++++++++++++++++++++ TreeDB/internal/valuelog/valuelog_test.go | 86 ++++++++++ 2 files changed, 267 insertions(+) diff --git a/TreeDB/internal/valuelog/manager.go b/TreeDB/internal/valuelog/manager.go index 230bcef3a..f6102a29d 100644 --- a/TreeDB/internal/valuelog/manager.go +++ b/TreeDB/internal/valuelog/manager.go @@ -504,6 +504,11 @@ func (f *File) ReadUnsafeTo(ptr page.ValuePtr, verifyCRC bool, dst []byte) ([]by } } f.mmapReadFallbackReadAt.Add(1) + if !verifyCRC { + if val, usedDst, err, ok := f.readGroupedCompressedFromFileTo(ptr, dst); ok { + return val, usedDst, err + } + } return ReadAtWithDictTo(f.File, ptr, verifyCRC, f.dictLookup, f.templateLookup, 
f.templateDefCache, f.templateDecodeOpts, dst) } // Avoid per-read Stat/lock churn once we have exhausted the dead-mapping @@ -519,9 +524,185 @@ func (f *File) ReadUnsafeTo(ptr page.ValuePtr, verifyCRC bool, dst []byte) ([]by f.mmapReadMissDeadMappingCap.Add(1) } f.mmapReadFallbackReadAt.Add(1) + if !verifyCRC { + if val, usedDst, err, ok := f.readGroupedCompressedFromFileTo(ptr, dst); ok { + return val, usedDst, err + } + } return ReadAtWithDictTo(f.File, ptr, verifyCRC, f.dictLookup, f.templateLookup, f.templateDefCache, f.templateDecodeOpts, dst) } +// readGroupedCompressedFromFileTo handles grouped+compressed reads on the +// non-mmap fallback path while reusing File grouped-frame cache entries. +// +// ok=false means the caller should fall back to the generic ReadAtWithDictTo +// decoder path (for non-grouped / uncompressed / checksum-verified cases). +func (f *File) readGroupedCompressedFromFileTo(ptr page.ValuePtr, dst []byte) ([]byte, bool, error, bool) { + if f == nil || f.File == nil { + return nil, false, errors.New("valuelog: nil file"), true + } + if ptr.Offset < 4 || !page.ValuePtrIsGrouped(ptr) { + return nil, false, nil, false + } + + start := int64(ptr.Offset - 4) + var header [HeaderSize]byte + if _, err := f.File.ReadAt(header[:], start); err != nil { + return nil, false, err, true + } + if header[4] != Version { + return nil, false, ErrCorrupt, true + } + if header[5]&recordFlagGrouped == 0 { + return nil, false, nil, false + } + valueLen := binary.LittleEndian.Uint32(header[16:20]) + if recordSizeExceedsMax(valueLen) { + return nil, false, ErrRecordTooLarge, true + } + expectedLen := uint32(headerWithoutCRC) + valueLen + if !page.ValuePtrRecordLengthHintMatches(ptr, expectedLen) { + return nil, false, ErrCorrupt, true + } + if int(valueLen) < FrameHeaderSize { + return nil, false, ErrCorrupt, true + } + + frameOff := start + HeaderSize + var frameHeader [FrameHeaderSize]byte + if _, err := f.File.ReadAt(frameHeader[:], frameOff); err != nil { + 
return nil, false, err, true + } + if frameHeader[0] != FrameVersion { + return nil, false, ErrCorrupt, true + } + k := int(frameHeader[2]) + if k <= 0 || k > MaxFrameK { + return nil, false, ErrCorrupt, true + } + if frameHeader[1]&FrameFlagCompressed == 0 { + return nil, false, nil, false + } + + subIndex := int(page.ValuePtrSubIndex(ptr)) + if subIndex < 0 || subIndex >= k { + return nil, false, ErrCorrupt, true + } + if cachedRaw, valStart, valEnd, rawLen, hit := f.groupedFrameCacheLookup(start, false, subIndex); hit { + if uint32(len(cachedRaw)) != rawLen || valEnd < valStart || valEnd > rawLen { + return nil, false, ErrCorrupt, true + } + val := cachedRaw[valStart:valEnd] + if f.templateLookup != nil && templ.IsEncodedPayload(val) { + decoded, err := templ.DecodePayloadAppend(nil, val, func(id uint64) (templ.TemplateDef, error) { + return resolveTemplateDef(id, f.templateLookup, f.templateDefCache) + }, f.templateDecodeOpts) + if err != nil { + return nil, false, err, true + } + return decoded, false, nil, true + } + if dst != nil && cap(dst) >= len(val) { + out := dst[:len(val)] + copy(out, val) + return out, true, nil, true + } + out := make([]byte, len(val)) + copy(out, val) + return out, false, nil, true + } + + ridBytes := k * 8 + offsetBytes := (k + 1) * 4 + prefixLen := FrameHeaderSize + ridBytes + offsetBytes + if int(valueLen) < prefixLen { + return nil, false, ErrCorrupt, true + } + + payloadScratch := getDecodeScratch(int(valueLen)) + defer putDecodeScratch(payloadScratch) + payload := payloadScratch[:int(valueLen)] + if _, err := f.File.ReadAt(payload, start+HeaderSize); err != nil { + return nil, false, err, true + } + + off := FrameHeaderSize + ridBytes + var offsets [MaxFrameK + 1]uint32 + prev := uint32(0) + for i := 0; i < k+1; i++ { + cur := binary.LittleEndian.Uint32(payload[off : off+4]) + if cur < prev { + return nil, false, ErrCorrupt, true + } + offsets[i] = cur + prev = cur + off += 4 + } + rawLen := offsets[k] + if 
limits.MaxRecordSize > 0 && int64(rawLen) > limits.MaxRecordSize { + return nil, false, ErrRecordTooLarge, true + } + valStart := offsets[subIndex] + valEnd := offsets[subIndex+1] + if valEnd < valStart || valEnd > rawLen { + return nil, false, ErrCorrupt, true + } + + frame := FrameHeader{ + Version: frameHeader[0], + Flags: frameHeader[1], + K: uint8(k), + Reserved: frameHeader[3], + DictID: binary.LittleEndian.Uint64(frameHeader[4:12]), + } + + raw := f.takeDecodeScratch(int(rawLen)) + pooledRaw := true + raw, err := decodeFramePayloadTo(frame, payload[prefixLen:], f.dictLookup, rawLen, raw) + if err != nil { + if pooledRaw { + f.releaseDecodeScratch(raw) + } + return nil, false, err, true + } + if uint32(len(raw)) != rawLen { + if pooledRaw { + f.releaseDecodeScratch(raw) + } + return nil, false, ErrCorrupt, true + } + cachedRaw := f.groupedFrameCacheStore(start, false, k, offsets, raw, true) + + val := raw[valStart:valEnd] + if f.templateLookup != nil && templ.IsEncodedPayload(val) { + decoded, err := templ.DecodePayloadAppend(nil, val, func(id uint64) (templ.TemplateDef, error) { + return resolveTemplateDef(id, f.templateLookup, f.templateDefCache) + }, f.templateDecodeOpts) + if pooledRaw && !cachedRaw { + f.releaseDecodeScratch(raw) + } + if err != nil { + return nil, false, err, true + } + return decoded, false, nil, true + } + + if dst != nil && cap(dst) >= len(val) { + out := dst[:len(val)] + copy(out, val) + if pooledRaw && !cachedRaw { + f.releaseDecodeScratch(raw) + } + return out, true, nil, true + } + out := make([]byte, len(val)) + copy(out, val) + if pooledRaw && !cachedRaw { + f.releaseDecodeScratch(raw) + } + return out, false, nil, true +} + func (f *File) ReadAppend(ptr page.ValuePtr, verifyCRC bool, dst []byte) ([]byte, error) { if f == nil || f.File == nil { return nil, errors.New("valuelog: nil file") diff --git a/TreeDB/internal/valuelog/valuelog_test.go b/TreeDB/internal/valuelog/valuelog_test.go index b974c59ff..96ade0364 100644 --- 
a/TreeDB/internal/valuelog/valuelog_test.go +++ b/TreeDB/internal/valuelog/valuelog_test.go @@ -802,6 +802,92 @@ func TestValueLogManager_GroupedFrameCache_MaxRawBytesSkipsOversize(t *testing.T } } +func TestValueLogManager_ReadUnsafeTo_CompressedGroupedFallbackUsesCache(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("mmap not supported on windows") + } + + // Force file-read fallback so this test exercises the non-mmap path. + withMappedSealedBudget(t, 0) + + dir := t.TempDir() + fileID, err := EncodeFileID(0, 1) + if err != nil { + t.Fatalf("encode file id: %v", err) + } + path := filepath.Join(dir, "value-l0-000001.log") + + writer, err := NewWriter(path, fileID) + if err != nil { + t.Fatalf("new writer: %v", err) + } + writer.SetBlockCompression(BlockCodecSnappy, true) + ptrs, want := appendCompressedFrameForCacheTests(t, writer, 0, 4) + if err := writer.Close(); err != nil { + t.Fatalf("close: %v", err) + } + + m, err := NewManager(dir) + if err != nil { + t.Fatalf("new manager: %v", err) + } + defer func() { _ = m.Close() }() + m.SetDisableReadChecksum(true) + m.SetGroupedFrameCacheEntries(4) + + f := m.files[fileID] + if f == nil { + t.Fatalf("missing opened file for id=%d", fileID) + } + + dst := make([]byte, 0, 512) + got0, used0, err := m.ReadUnsafeTo(ptrs[0], dst[:0]) + if err != nil { + t.Fatalf("read unsafe to first: %v", err) + } + if !used0 { + t.Fatalf("expected first read to use dst") + } + if !bytes.Equal(got0, want[0]) { + t.Fatalf("first value mismatch: got=%q want=%q", got0, want[0]) + } + + hits0, misses0, entries0, _ := f.groupedFrameCacheStats() + if misses0 == 0 { + t.Fatalf("expected first compressed grouped read to miss cache") + } + if entries0 == 0 { + t.Fatalf("expected first compressed grouped read to populate cache") + } + + got1, used1, err := m.ReadUnsafeTo(ptrs[1], dst[:0]) + if err != nil { + t.Fatalf("read unsafe to second: %v", err) + } + if !used1 { + t.Fatalf("expected second read to use dst") + } + if 
!bytes.Equal(got1, want[1]) { + t.Fatalf("second value mismatch: got=%q want=%q", got1, want[1]) + } + + hits1, misses1, entries1, _ := f.groupedFrameCacheStats() + if hits1 <= hits0 { + t.Fatalf("expected second read to hit grouped cache: hits before=%d after=%d", hits0, hits1) + } + if misses1 != misses0 { + t.Fatalf("unexpected cache miss increase on second read: before=%d after=%d", misses0, misses1) + } + if entries1 == 0 { + t.Fatalf("expected grouped cache entries to remain populated") + } + + _, _, missNoMapping, _, fallbacks := m.MmapReadStats() + if missNoMapping == 0 || fallbacks == 0 { + t.Fatalf("expected fallback path stats to reflect no-mmap reads: miss_no_mapping=%d fallbacks=%d", missNoMapping, fallbacks) + } +} + func TestReadAtGroupedFastPathWithoutChecksum(t *testing.T) { dir := t.TempDir() path := filepath.Join(dir, "value-000001.log") From 8afac0815a0def682c3e59141d1fd832e15f64a5 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 20:21:34 -1000 Subject: [PATCH 52/61] unified-bench: print gc observed-source stats --- cmd/unified_bench/main.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cmd/unified_bench/main.go b/cmd/unified_bench/main.go index 76053da17..51f4c76f7 100644 --- a/cmd/unified_bench/main.go +++ b/cmd/unified_bench/main.go @@ -1355,6 +1355,18 @@ func printTreeDBCacheStats(w io.Writer, inst *DBInstance, prefix string) { "treedb.cache.vlog_generation.rewrite.runs", "treedb.cache.vlog_generation.gc.deleted_segments", "treedb.cache.vlog_generation.gc.deleted_bytes", + "treedb.cache.vlog_generation.gc.last_observed_source.segments", + "treedb.cache.vlog_generation.gc.last_observed_source.segments_referenced", + "treedb.cache.vlog_generation.gc.last_observed_source.segments_eligible", + "treedb.cache.vlog_generation.gc.last_observed_source.segments_pending", + "treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_retained", + 
"treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_in_use", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes_referenced", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes_eligible", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes_pending", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_retained", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_in_use", "treedb.cache.vlog_generation.gc.runs", "treedb.cache.vlog_retained_prune.runs", "treedb.cache.vlog_retained_prune.forced_runs", From 71fc32a6bbe9cb0f497762038748fbf2bfd8198b Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 00:31:05 -1000 Subject: [PATCH 53/61] bench: harden run_celestia AB loop against stuck outliers --- docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md | 11 + scripts/run_celestia_ab.sh | 302 +++++++++++++++---- 2 files changed, 254 insertions(+), 59 deletions(-) diff --git a/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md index 25ba010c0..92dc8875b 100644 --- a/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md +++ b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md @@ -87,6 +87,9 @@ Now includes anti-loop safeguards: - clear stop (improvement/regression) - futility stop (`futile_remaining_pairs`) - low-signal neutral-streak stop (`low_signal_neutral_streak`) +- strict new-run-home detection (no fallback to old run dirs) +- per-variant timeout/retry for stuck syncs +- invalid-pair streak stop (`invalid_pair_streak`) Example: @@ -97,10 +100,18 @@ CLEAR_WIN_PAIRS=2 \ CLEAR_LOSS_PAIRS=2 \ LOW_SIGNAL_MIN_PAIRS=3 \ LOW_SIGNAL_NEUTRAL_STREAK=3 \ +RUN_TIMEOUT_SECONDS=1800 \ +RUN_MAX_ATTEMPTS_PER_VARIANT=2 \ +RUN_RETRY_SLEEP_SECONDS=20 \ +INVALID_PAIR_STREAK_STOP=2 \ REWRITE_ENABLED=1 \ ./scripts/run_celestia_ab.sh ``` +Notes: +- Pair execution remains strictly single-run at a time and 
interleaved by pair order. +- Invalid runs (timeout, launcher failure, missing new run home, rewrite failure) are recorded but excluded from pair scoring. + ## Process Review Cadence Review and revise the loop after every decision event: diff --git a/scripts/run_celestia_ab.sh b/scripts/run_celestia_ab.sh index 63321696d..eadfd0f9a 100755 --- a/scripts/run_celestia_ab.sh +++ b/scripts/run_celestia_ab.sh @@ -19,6 +19,10 @@ STOP_ON_CLEAR="${STOP_ON_CLEAR:-1}" SLEEP_BETWEEN_RUNS_SECONDS="${SLEEP_BETWEEN_RUNS_SECONDS:-5}" LOW_SIGNAL_MIN_PAIRS="${LOW_SIGNAL_MIN_PAIRS:-3}" LOW_SIGNAL_NEUTRAL_STREAK="${LOW_SIGNAL_NEUTRAL_STREAK:-3}" +RUN_TIMEOUT_SECONDS="${RUN_TIMEOUT_SECONDS:-1800}" +RUN_MAX_ATTEMPTS_PER_VARIANT="${RUN_MAX_ATTEMPTS_PER_VARIANT:-2}" +RUN_RETRY_SLEEP_SECONDS="${RUN_RETRY_SLEEP_SECONDS:-20}" +INVALID_PAIR_STREAK_STOP="${INVALID_PAIR_STREAK_STOP:-2}" TS="$(date +%Y%m%d%H%M%S)" OUT="${OUT_DIR:-$ROOT/artifacts/celestia_ab/$TS}" @@ -34,6 +38,22 @@ if [[ "$MAX_PAIRS" -lt 1 ]]; then echo "MAX_PAIRS must be >= 1" >&2 exit 1 fi +if [[ "$RUN_TIMEOUT_SECONDS" -lt 0 ]]; then + echo "RUN_TIMEOUT_SECONDS must be >= 0" >&2 + exit 1 +fi +if [[ "$RUN_MAX_ATTEMPTS_PER_VARIANT" -lt 1 ]]; then + echo "RUN_MAX_ATTEMPTS_PER_VARIANT must be >= 1" >&2 + exit 1 +fi +if [[ "$RUN_RETRY_SLEEP_SECONDS" -lt 0 ]]; then + echo "RUN_RETRY_SLEEP_SECONDS must be >= 0" >&2 + exit 1 +fi +if [[ "$INVALID_PAIR_STREAK_STOP" -lt 1 ]]; then + echo "INVALID_PAIR_STREAK_STOP must be >= 1" >&2 + exit 1 +fi mkdir -p "$OUT/runs" @@ -55,6 +75,10 @@ stop_on_clear=$STOP_ON_CLEAR sleep_between_runs_seconds=$SLEEP_BETWEEN_RUNS_SECONDS low_signal_min_pairs=$LOW_SIGNAL_MIN_PAIRS low_signal_neutral_streak=$LOW_SIGNAL_NEUTRAL_STREAK +run_timeout_seconds=$RUN_TIMEOUT_SECONDS +run_max_attempts_per_variant=$RUN_MAX_ATTEMPTS_PER_VARIANT +run_retry_sleep_seconds=$RUN_RETRY_SLEEP_SECONDS +invalid_pair_streak_stop=$INVALID_PAIR_STREAK_STOP META list_run_homes() { @@ -90,8 +114,7 @@ detect_new_run_home() { return 0 fi 
done < <(list_run_homes) - - list_run_homes | head -n 1 + return 1 } run_variant() { @@ -104,47 +127,86 @@ run_variant() { local run_dir="$OUT/runs/$run_id" mkdir -p "$run_dir" - local before_file="$run_dir/before_homes.txt" - list_run_homes >"$before_file" - - local run_start - run_start=$(date +%s) - ( - set -euo pipefail - if [[ -n "$env_file" ]]; then - # shellcheck source=/dev/null - set -a - source "$env_file" - set +a + local run_home="" + local app_db="" + local run_start=0 + local run_end=0 + local run_rc=0 + local attempt_used=0 + local invalid_reason="" + local pre_app_bytes=0 + local pre_wal_bytes=0 + local post_app_bytes=0 + local post_wal_bytes=0 + local rewrite_attempted=0 + local rewrite_seconds=0 + local rewrite_rc=0 + local analyze_json="$run_dir/maintenance.json" + rm -f "$analyze_json" + : >"$run_dir/attempts.log" + + local attempt + for ((attempt = 1; attempt <= RUN_MAX_ATTEMPTS_PER_VARIANT; attempt++)); do + attempt_used="$attempt" + local attempt_dir="$run_dir/attempt_${attempt}" + mkdir -p "$attempt_dir" + + local before_file="$attempt_dir/before_homes.txt" + list_run_homes >"$before_file" + + run_start=$(date +%s) + set +e + ( + set -euo pipefail + if [[ -n "$env_file" ]]; then + # shellcheck source=/dev/null + set -a + source "$env_file" + set +a + fi + # Non-login shell avoids user profile side effects (e.g. tty-dependent exports) + # that can fail under nohup/background runs. + if [[ "$RUN_TIMEOUT_SECONDS" -gt 0 ]] && command -v timeout >/dev/null 2>&1; then + timeout --signal=TERM --kill-after=60 "${RUN_TIMEOUT_SECONDS}s" bash -c "$RUN_CMD" + else + bash -c "$RUN_CMD" + fi + ) >"$attempt_dir/launcher.log" 2>&1 + run_rc=$? 
+ set -e + cp "$attempt_dir/launcher.log" "$run_dir/launcher.log" + run_end=$(date +%s) + + run_home="$(detect_new_run_home "$before_file" || true)" + invalid_reason="" + if [[ "$run_rc" -eq 124 || "$run_rc" -eq 137 || "$run_rc" -eq 143 ]]; then + invalid_reason="run_timeout" + elif [[ "$run_rc" -ne 0 ]]; then + invalid_reason="run_cmd_failed" + elif [[ -z "$run_home" || ! -d "$run_home" ]]; then + invalid_reason="run_home_missing" fi - # Non-login shell avoids user profile side effects (e.g. tty-dependent exports) - # that can fail under nohup/background runs. - bash -c "$RUN_CMD" - ) >"$run_dir/launcher.log" 2>&1 - local run_end - run_end=$(date +%s) - - local run_home - run_home="$(detect_new_run_home "$before_file")" - if [[ -z "$run_home" || ! -d "$run_home" ]]; then - echo "failed to detect run home for $run_id" >&2 - exit 1 - fi - local app_db="$run_home/data/application.db" - local pre_app_bytes pre_wal_bytes - pre_app_bytes="$(du_bytes "$app_db")" - pre_wal_bytes="$(du_bytes "$app_db/maindb/wal")" + echo "attempt=$attempt run_exit_code=$run_rc invalid_reason=${invalid_reason:-none} run_home=${run_home:-}" >>"$run_dir/attempts.log" + if [[ -z "$invalid_reason" ]]; then + break + fi + if (( attempt < RUN_MAX_ATTEMPTS_PER_VARIANT )); then + sleep "$RUN_RETRY_SLEEP_SECONDS" + fi + done + + if [[ -n "$run_home" && -d "$run_home" ]]; then + app_db="$run_home/data/application.db" + pre_app_bytes="$(du_bytes "$app_db")" + pre_wal_bytes="$(du_bytes "$app_db/maindb/wal")" - local analyze_json="$run_dir/maintenance.json" - if ! "$ANALYZER" --json "$run_home" >"$analyze_json" 2>"$run_dir/analyze.stderr.log"; then - rm -f "$analyze_json" + if ! 
"$ANALYZER" --json "$run_home" >"$analyze_json" 2>"$run_dir/analyze.stderr.log"; then + rm -f "$analyze_json" + fi fi - local rewrite_attempted=0 - local rewrite_seconds=0 - local rewrite_rc=0 - if [[ "$REWRITE_ENABLED" == "1" && -x "$TREEMAP_BIN" && -d "$app_db" ]]; then + if [[ -z "$invalid_reason" && "$REWRITE_ENABLED" == "1" && -x "$TREEMAP_BIN" && -n "$app_db" && -d "$app_db" ]]; then rewrite_attempted=1 local rewrite_start rewrite_start=$(date +%s) @@ -155,21 +217,23 @@ run_variant() { local rewrite_end rewrite_end=$(date +%s) rewrite_seconds=$((rewrite_end - rewrite_start)) - else - rewrite_rc=0 + if [[ "$rewrite_rc" -ne 0 ]]; then + invalid_reason="rewrite_failed" + fi fi - local post_app_bytes post_wal_bytes - post_app_bytes="$(du_bytes "$app_db")" - post_wal_bytes="$(du_bytes "$app_db/maindb/wal")" + if [[ -n "$app_db" ]]; then + post_app_bytes="$(du_bytes "$app_db")" + post_wal_bytes="$(du_bytes "$app_db/maindb/wal")" + fi local run_json="$run_dir/run.json" - python3 - "$run_home" "$run_json" "$variant" "$pair_index" "$run_start" "$run_end" "$rewrite_attempted" "$rewrite_seconds" "$rewrite_rc" "$pre_app_bytes" "$pre_wal_bytes" "$post_app_bytes" "$post_wal_bytes" "$analyze_json" <<'PY' + python3 - "$run_home" "$run_json" "$variant" "$pair_index" "$run_start" "$run_end" "$rewrite_attempted" "$rewrite_seconds" "$rewrite_rc" "$pre_app_bytes" "$pre_wal_bytes" "$post_app_bytes" "$post_wal_bytes" "$analyze_json" "$invalid_reason" "$run_rc" "$attempt_used" "$RUN_MAX_ATTEMPTS_PER_VARIANT" "$RUN_TIMEOUT_SECONDS" <<'PY' import json import sys from pathlib import Path -run_home = Path(sys.argv[1]) +run_home_raw = sys.argv[1] out_path = Path(sys.argv[2]) variant = sys.argv[3] pair_index = int(sys.argv[4]) @@ -183,6 +247,12 @@ pre_wal_bytes = int(sys.argv[11]) post_app_bytes = int(sys.argv[12]) post_wal_bytes = int(sys.argv[13]) analyze_json_path = Path(sys.argv[14]) +invalid_reason = str(sys.argv[15]).strip() +run_exit_code = int(sys.argv[16]) +attempt = 
int(sys.argv[17]) +max_attempts = int(sys.argv[18]) +run_timeout_seconds = int(sys.argv[19]) +run_home = Path(run_home_raw) if run_home_raw else None def parse_sync_time(path: Path) -> dict[str, str]: out: dict[str, str] = {} @@ -210,7 +280,8 @@ def safe_int(raw: str | None, default: int = 0) -> int: except Exception: return default -sync = parse_sync_time(run_home / "sync" / "sync-time.log") +sync_path = run_home / "sync" / "sync-time.log" if run_home is not None else None +sync = parse_sync_time(sync_path) if sync_path is not None else {} maintenance = {} if analyze_json_path.is_file(): try: @@ -224,15 +295,25 @@ if analyze_json_path.is_file(): t_sync = safe_int(sync.get("duration_seconds"), max(0, run_end - run_start)) t_rw = rewrite_seconds if rewrite_attempted == 1 else 0 -if rewrite_attempted == 1 and rewrite_rc != 0: - t_total = None -else: - t_total = t_sync + t_rw +resolved_invalid_reason = invalid_reason +if not resolved_invalid_reason and rewrite_attempted == 1 and rewrite_rc != 0: + resolved_invalid_reason = "rewrite_failed" +valid = resolved_invalid_reason == "" +t_total = (t_sync + t_rw) if valid else None result = { "pair_index": pair_index, "variant": variant, - "run_home": str(run_home), + "run_home": run_home_raw, + "status": { + "valid": valid, + "invalid_reason": resolved_invalid_reason, + "run_exit_code": run_exit_code, + "attempt": attempt, + "max_attempts": max_attempts, + "run_timeout_seconds": run_timeout_seconds, + "sync_time_present": sync_path.is_file() if sync_path is not None else False, + }, "sync": { "duration_seconds": t_sync, "max_rss_kb": safe_int(sync.get("max_rss_kb"), 0), @@ -268,12 +349,16 @@ out_path.write_text(json.dumps(result, indent=2, sort_keys=True), encoding="utf- print(out_path) PY - echo "run_id=$run_id run_home=$run_home json=$run_json" + local run_valid="false" + if [[ -z "$invalid_reason" ]]; then + run_valid="true" + fi + echo "run_id=$run_id run_home=${run_home:-} valid=$run_valid 
invalid_reason=${invalid_reason:-none} attempts=$attempt_used/$RUN_MAX_ATTEMPTS_PER_VARIANT json=$run_json" } aggregate_and_decide() { local decision_json="$OUT/decision.json" - python3 - "$OUT" "$SIZE_TOLERANCE_BYTES" "$TIME_TOLERANCE_SECONDS" "$MIN_PAIRS" "$CLEAR_WIN_PAIRS" "$CLEAR_LOSS_PAIRS" "$MAX_PAIRS" "$STOP_ON_CLEAR" "$LOW_SIGNAL_MIN_PAIRS" "$LOW_SIGNAL_NEUTRAL_STREAK" "$decision_json" <<'PY' + python3 - "$OUT" "$SIZE_TOLERANCE_BYTES" "$TIME_TOLERANCE_SECONDS" "$MIN_PAIRS" "$CLEAR_WIN_PAIRS" "$CLEAR_LOSS_PAIRS" "$MAX_PAIRS" "$STOP_ON_CLEAR" "$LOW_SIGNAL_MIN_PAIRS" "$LOW_SIGNAL_NEUTRAL_STREAK" "$INVALID_PAIR_STREAK_STOP" "$decision_json" <<'PY' import csv import json import sys @@ -289,7 +374,8 @@ max_pairs = int(sys.argv[7]) stop_on_clear = sys.argv[8] == "1" low_signal_min_pairs = int(sys.argv[9]) low_signal_neutral_streak = int(sys.argv[10]) -decision_path = Path(sys.argv[11]) +invalid_pair_streak_stop = int(sys.argv[11]) +decision_path = Path(sys.argv[12]) run_files = sorted(out.glob("runs/*/run.json")) runs = [] @@ -301,6 +387,39 @@ for p in run_files: runs.sort(key=lambda r: (int(r.get("pair_index", 0)), str(r.get("variant", "")))) +def run_is_valid(run: dict) -> bool: + status = run.get("status") + if isinstance(status, dict) and "valid" in status: + return bool(status.get("valid")) + metrics = run.get("metrics", {}) or {} + rewrite = run.get("rewrite", {}) or {} + return metrics.get("t_total_seconds") is not None and int(rewrite.get("exit_code", 0)) == 0 + +def run_invalid_reason(run: dict) -> str: + status = run.get("status") + if isinstance(status, dict): + return str(status.get("invalid_reason", "") or "") + return "" + +def run_attempt(run: dict): + status = run.get("status") + if isinstance(status, dict): + return status.get("attempt") + return None + +def run_max_attempts(run: dict): + status = run.get("status") + if isinstance(status, dict): + return status.get("max_attempts") + return None + +def run_exit_code(run: dict): + status = 
run.get("status") + if isinstance(status, dict) and status.get("run_exit_code") is not None: + return status.get("run_exit_code") + rewrite = run.get("rewrite", {}) or {} + return rewrite.get("exit_code") + runs_csv = out / "runs.csv" with runs_csv.open("w", newline="", encoding="utf-8") as fh: w = csv.writer(fh) @@ -316,6 +435,11 @@ with runs_csv.open("w", newline="", encoding="utf-8") as fh: "s_post_app_bytes", "s_post_wal_bytes", "max_rss_kb", + "valid", + "invalid_reason", + "run_exit_code", + "run_attempt", + "run_max_attempts", "rewrite_exit_code", "rewrite_runs", "gc_runs", @@ -327,6 +451,7 @@ with runs_csv.open("w", newline="", encoding="utf-8") as fh: s = r.get("sizes", {}) or {} rw = r.get("rewrite", {}) or {} summary = r.get("maintenance_summary", {}) or {} + valid = run_is_valid(r) w.writerow([ int(r.get("pair_index", 0)), str(r.get("variant", "")), @@ -339,6 +464,11 @@ with runs_csv.open("w", newline="", encoding="utf-8") as fh: s.get("post_app_bytes"), s.get("post_wal_bytes"), m.get("max_rss_kb"), + valid, + run_invalid_reason(r), + run_exit_code(r), + run_attempt(r), + run_max_attempts(r), rw.get("exit_code"), summary.get("rewrite_runs", 0), summary.get("gc_runs", 0), @@ -354,12 +484,34 @@ for r in runs: pair_rows = [] wins = 0 losses = 0 +raw_pairs = 0 +invalid_pairs = 0 for pair in sorted(by_pair): row = by_pair[pair] ctrl = row.get("control") cand = row.get("candidate") if not ctrl or not cand: continue + raw_pairs += 1 + ctrl_valid = run_is_valid(ctrl) + cand_valid = run_is_valid(cand) + ctrl_reason = run_invalid_reason(ctrl) + cand_reason = run_invalid_reason(cand) + if not ctrl_valid or not cand_valid: + invalid_pairs += 1 + pair_rows.append({ + "pair_index": pair, + "delta_t_sync_seconds": None, + "delta_t_total_seconds": None, + "delta_s_sync_app_bytes": None, + "delta_s_post_wal_bytes": None, + "control_valid": ctrl_valid, + "candidate_valid": cand_valid, + "control_invalid_reason": ctrl_reason, + "candidate_invalid_reason": cand_reason, + 
"outcome": "invalid", + }) + continue cm = cand.get("metrics", {}) or {} bm = ctrl.get("metrics", {}) or {} cand_total = cm.get("t_total_seconds") @@ -398,6 +550,10 @@ for pair in sorted(by_pair): "delta_t_total_seconds": d_total, "delta_s_sync_app_bytes": d_sync_app, "delta_s_post_wal_bytes": d_post_wal, + "control_valid": ctrl_valid, + "candidate_valid": cand_valid, + "control_invalid_reason": ctrl_reason, + "candidate_invalid_reason": cand_reason, "outcome": outcome, }) @@ -410,6 +566,10 @@ with pairs_csv.open("w", newline="", encoding="utf-8") as fh: "delta_t_total_seconds", "delta_s_sync_app_bytes", "delta_s_post_wal_bytes", + "control_valid", + "candidate_valid", + "control_invalid_reason", + "candidate_invalid_reason", "outcome", ]) for r in pair_rows: @@ -419,17 +579,28 @@ with pairs_csv.open("w", newline="", encoding="utf-8") as fh: r["delta_t_total_seconds"], r["delta_s_sync_app_bytes"], r["delta_s_post_wal_bytes"], + r["control_valid"], + r["candidate_valid"], + r["control_invalid_reason"], + r["candidate_invalid_reason"], r["outcome"], ]) -completed_pairs = len(pair_rows) +scored_rows = [row for row in pair_rows if row.get("outcome") != "invalid"] +completed_pairs = len(scored_rows) neutral = max(0, completed_pairs - wins - losses) neutral_streak = 0 -for row in reversed(pair_rows): +for row in reversed(scored_rows): if row.get("outcome") == "neutral": neutral_streak += 1 continue break +invalid_streak = 0 +for row in reversed(pair_rows): + if row.get("outcome") == "invalid": + invalid_streak += 1 + continue + break reason = "continue" stop = False @@ -441,7 +612,7 @@ if stop_on_clear and completed_pairs >= min_pairs: stop = True reason = "clear_regression" else: - remaining = max(0, max_pairs - completed_pairs) + remaining = max(0, max_pairs - raw_pairs) can_reach_clear_win = (wins + remaining) >= clear_win_pairs can_reach_clear_loss = (losses + remaining) >= clear_loss_pairs if not can_reach_clear_win and not can_reach_clear_loss: @@ -452,7 +623,11 @@ 
if (not stop) and completed_pairs >= low_signal_min_pairs and neutral_streak >= stop = True reason = "low_signal_neutral_streak" -if (not stop) and completed_pairs >= max_pairs: +if (not stop) and invalid_streak >= invalid_pair_streak_stop: + stop = True + reason = "invalid_pair_streak" + +if (not stop) and raw_pairs >= max_pairs: stop = True reason = "max_pairs" @@ -460,13 +635,17 @@ summary_md = out / "summary.md" lines = [] lines.append("# run_celestia A/B summary") lines.append("") -lines.append(f"- completed pairs: `{completed_pairs}`") +lines.append(f"- observed pairs: `{raw_pairs}`") +lines.append(f"- scored pairs: `{completed_pairs}`") +lines.append(f"- invalid pairs skipped: `{invalid_pairs}`") lines.append(f"- wins/losses/neutral: `{wins}` / `{losses}` / `{neutral}`") lines.append(f"- neutral streak (tail): `{neutral_streak}`") +lines.append(f"- invalid streak (tail): `{invalid_streak}`") lines.append(f"- size tolerance bytes: `{size_tol}`") lines.append(f"- time tolerance seconds: `{time_tol}`") lines.append(f"- low-signal min pairs: `{low_signal_min_pairs}`") lines.append(f"- low-signal neutral streak: `{low_signal_neutral_streak}`") +lines.append(f"- invalid pair streak stop: `{invalid_pair_streak_stop}`") lines.append(f"- decision: `{reason}`") lines.append("") lines.append("## Artifacts") @@ -480,6 +659,8 @@ if pair_rows: lines.append("## Last Pair") lines.append("") lines.append(f"- pair: `{last['pair_index']}` outcome=`{last['outcome']}`") + lines.append(f"- control_valid: `{last['control_valid']}` reason=`{last['control_invalid_reason']}`") + lines.append(f"- candidate_valid: `{last['candidate_valid']}` reason=`{last['candidate_invalid_reason']}`") lines.append(f"- delta_t_sync_seconds: `{last['delta_t_sync_seconds']}`") lines.append(f"- delta_t_total_seconds: `{last['delta_t_total_seconds']}`") lines.append(f"- delta_s_sync_app_bytes: `{last['delta_s_sync_app_bytes']}`") @@ -487,11 +668,14 @@ if pair_rows: summary_md.write_text("\n".join(lines) + 
"\n", encoding="utf-8") payload = { + "observed_pairs": raw_pairs, "completed_pairs": completed_pairs, + "invalid_pairs": invalid_pairs, "wins": wins, "losses": losses, "neutral": neutral, "neutral_streak": neutral_streak, + "invalid_streak": invalid_streak, "stop": stop, "reason": reason, } From 1ba077e49a36ebd5362b1736137687e8230cbbea Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 01:26:37 -1000 Subject: [PATCH 54/61] bench: enrich celestia timeout sync probe totals --- docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md | 1 + scripts/run_celestia_ab.sh | 48 ++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md index 92dc8875b..7b0836451 100644 --- a/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md +++ b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md @@ -111,6 +111,7 @@ REWRITE_ENABLED=1 \ Notes: - Pair execution remains strictly single-run at a time and interleaved by pair order. - Invalid runs (timeout, launcher failure, missing new run home, rewrite failure) are recorded but excluded from pair scoring. +- Per-run `run.json` now includes `status.sync_probe` (last snapshot chunk, last and max snapshot totals, fetch event count, state-sync-complete flag) for timeout forensics. 
## Process Review Cadence diff --git a/scripts/run_celestia_ab.sh b/scripts/run_celestia_ab.sh index eadfd0f9a..ef8c3f088 100755 --- a/scripts/run_celestia_ab.sh +++ b/scripts/run_celestia_ab.sh @@ -230,6 +230,7 @@ run_variant() { local run_json="$run_dir/run.json" python3 - "$run_home" "$run_json" "$variant" "$pair_index" "$run_start" "$run_end" "$rewrite_attempted" "$rewrite_seconds" "$rewrite_rc" "$pre_app_bytes" "$pre_wal_bytes" "$post_app_bytes" "$post_wal_bytes" "$analyze_json" "$invalid_reason" "$run_rc" "$attempt_used" "$RUN_MAX_ATTEMPTS_PER_VARIANT" "$RUN_TIMEOUT_SECONDS" <<'PY' import json +import re import sys from pathlib import Path @@ -280,8 +281,54 @@ def safe_int(raw: str | None, default: int = 0) -> int: except Exception: return default +def probe_sync_progress(node_log_path: Path | None) -> dict[str, object]: + progress = { + "node_log_present": False, + "last_snapshot_chunk": 0, + "last_snapshot_total": 0, + "last_nonzero_snapshot_total": 0, + "max_snapshot_total": 0, + "snapshot_fetch_events": 0, + "state_sync_complete": False, + } + if node_log_path is None or not node_log_path.is_file(): + return progress + + progress["node_log_present"] = True + try: + text = node_log_path.read_text(encoding="utf-8", errors="replace") + except Exception: + return progress + + last_chunk = 0 + last_total = 0 + last_nonzero_total = 0 + max_total = 0 + events = 0 + for m in re.finditer(r"Fetching snapshot chunk chunk=(\d+).*total=(\d+)", text): + events += 1 + try: + last_chunk = int(m.group(1)) + last_total = int(m.group(2)) + if last_total > 0: + last_nonzero_total = last_total + if last_total > max_total: + max_total = last_total + except Exception: + continue + + progress["last_snapshot_chunk"] = last_chunk + progress["last_snapshot_total"] = last_total + progress["last_nonzero_snapshot_total"] = last_nonzero_total + progress["max_snapshot_total"] = max_total + progress["snapshot_fetch_events"] = events + progress["state_sync_complete"] = ("State sync 
complete" in text) or ("statesync complete" in text.lower()) + return progress + sync_path = run_home / "sync" / "sync-time.log" if run_home is not None else None sync = parse_sync_time(sync_path) if sync_path is not None else {} +node_log_path = run_home / "sync" / "node.log" if run_home is not None else None +sync_probe = probe_sync_progress(node_log_path) maintenance = {} if analyze_json_path.is_file(): try: @@ -313,6 +360,7 @@ result = { "max_attempts": max_attempts, "run_timeout_seconds": run_timeout_seconds, "sync_time_present": sync_path.is_file() if sync_path is not None else False, + "sync_probe": sync_probe, }, "sync": { "duration_seconds": t_sync, From 3c29b271da64d9a9f6cb6a2840d2708275128f55 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 08:20:30 -1000 Subject: [PATCH 55/61] treedb: cut rewrite alloc churn and sharpen bench signal --- TreeDB/db/vlog_health.go | 57 +++- TreeDB/db/vlog_rewrite.go | 363 +++++++++++++++++------ TreeDB/db/vlog_rewrite_bench_test.go | 424 +++++++++++++++++++++++++++ 3 files changed, 758 insertions(+), 86 deletions(-) create mode 100644 TreeDB/db/vlog_rewrite_bench_test.go diff --git a/TreeDB/db/vlog_health.go b/TreeDB/db/vlog_health.go index fb2a1d51f..5117b1d78 100644 --- a/TreeDB/db/vlog_health.go +++ b/TreeDB/db/vlog_health.go @@ -95,6 +95,30 @@ func segmentAgeSeconds(path string, now time.Time) int64 { return int64(age / time.Second) } +func advanceSegmentAgeSeconds(h valueLogSegmentHealth, now time.Time) int64 { + age := h.AgeSeconds + if age < 0 { + age = 0 + } + if h.LastUpdatedUnixNano <= 0 { + return age + } + prevSec := h.LastUpdatedUnixNano / int64(time.Second) + nowSec := now.Unix() + if nowSec <= prevSec { + return age + } + delta := nowSec - prevSec + if delta <= 0 { + return age + } + // Clamp on overflow to preserve monotonic, bounded metadata. 
+ if age > int64(^uint64(0)>>1)-delta { + return int64(^uint64(0) >> 1) + } + return age + delta +} + func updateValueLogHealthAfterGC(dbDir string, set *valuelog.Set, referenced map[uint32]struct{}) error { path := valueLogHealthPath(dbDir) health, err := loadValueLogHealth(path) @@ -120,7 +144,10 @@ func updateValueLogHealthAfterGC(dbDir string, set *valuelog.Set, referenced map } else if size > 0 && h.LiveBytes > size { h.LiveBytes = size } - h.AgeSeconds = segmentAgeSeconds(f.Path, now) + // Avoid per-segment stat calls on the GC fast path; preserve age via + // monotonic last-update deltas and refresh from disk only in fallback + // scans below. + h.AgeSeconds = advanceSegmentAgeSeconds(h, now) h.LastUpdatedUnixNano = now.UnixNano() health[id] = h } @@ -163,7 +190,7 @@ func updateValueLogHealthAfterGC(dbDir string, set *valuelog.Set, referenced map return saveValueLogHealth(path, health) } -func updateValueLogHealthAfterRewrite(dbDir string, oldValueIDs map[uint32]struct{}) error { +func updateValueLogHealthAfterRewrite(dbDir string, oldValueIDs map[uint32]struct{}, set *valuelog.Set) error { path := valueLogHealthPath(dbDir) health, err := loadValueLogHealth(path) if err != nil { @@ -177,6 +204,32 @@ func updateValueLogHealthAfterRewrite(dbDir string, oldValueIDs map[uint32]struc } } + // Online rewrite callers can provide a current manager set and avoid an + // expensive directory rescan on the hot path. + if set != nil { + out := make(map[uint32]valueLogSegmentHealth, len(set.Files)) + for id, f := range set.Files { + if f == nil { + continue + } + h := health[id] + if _, wasOld := oldValueIDs[id]; !wasOld { + if h.RewriteCount < nextRewriteCount { + h.RewriteCount = nextRewriteCount + } + } + size := fileSize(f) + h.SegmentBytes = size + h.LiveBytes = size + // Online rewrite callers pass a manager set; avoid expensive stat calls + // per segment and keep age monotonic from prior metadata timestamps. 
+ h.AgeSeconds = advanceSegmentAgeSeconds(h, now) + h.LastUpdatedUnixNano = now.UnixNano() + out[id] = h + } + return saveValueLogHealth(path, out) + } + segments, err := listWALSegments(dbDir) if err != nil { return err diff --git a/TreeDB/db/vlog_rewrite.go b/TreeDB/db/vlog_rewrite.go index 133d1de4b..b63886625 100644 --- a/TreeDB/db/vlog_rewrite.go +++ b/TreeDB/db/vlog_rewrite.go @@ -32,6 +32,10 @@ const defaultValueLogRewriteSegmentBytes = 128 << 20 const rewriteDictMinPayloadBytes = 32 << 10 const rewriteDictBatchMaxK = 64 const rewriteReadScratchMaxCap = 1 << 20 // 1MiB cap to avoid retaining oversized decode buffers +const rewriteKeyArenaMaxCap = 1 << 20 // 1MiB cap to avoid retaining oversized key arenas + +var rewriteRIDStartScanner = nextRewriteRIDStart +var rewriteWALSegmentsLister = listWALSegments func rewriteAllowDictForSmallPayload(value []byte) bool { if len(value) < page.PageSize { @@ -706,7 +710,7 @@ func (db *DB) estimateValueLogLiveBytesBySegment(ctx context.Context) (_ map[uin var seenGroupedRecords map[groupedRecordKey]struct{} userIter := snap.tree.IteratorWithOptions(nil, nil, tree.IteratorOptions{Mode: tree.IteratorModePointerProjection}) - if err := db.collectValueLogLiveBytes(ctx, userIter, liveByID, &seenGroupedRecords); err != nil { + if err := db.collectValueLogLiveBytes(ctx, userIter, liveByID, &seenGroupedRecords, snap.state.ValueLogSet); err != nil { _ = userIter.Close() return nil, err } @@ -714,7 +718,7 @@ func (db *DB) estimateValueLogLiveBytesBySegment(ctx context.Context) (_ map[uin sysIter := tree.New(snap.idx.pager, newValueReader(snap.state.ValueLogSet), snap.state.SystemRootPageID). 
IteratorWithOptions(nil, nil, tree.IteratorOptions{Mode: tree.IteratorModePointerProjection}) - if err := db.collectValueLogLiveBytes(ctx, sysIter, liveByID, &seenGroupedRecords); err != nil { + if err := db.collectValueLogLiveBytes(ctx, sysIter, liveByID, &seenGroupedRecords, snap.state.ValueLogSet); err != nil { _ = sysIter.Close() return nil, err } @@ -725,10 +729,10 @@ func (db *DB) estimateValueLogLiveBytesBySegment(ctx context.Context) (_ map[uin // live-byte estimation; otherwise rewrite planning can select "stale" segments // that are actually pinned by live leaf pages. if snap.idx != nil && snap.idx.pager != nil { - if err := db.collectLeafRefValueLogLiveBytes(ctx, snap.idx.pager, snap.state.RootPageID, liveByID, &seenGroupedRecords); err != nil { + if err := db.collectLeafRefValueLogLiveBytes(ctx, snap.idx.pager, snap.state.RootPageID, liveByID, &seenGroupedRecords, snap.state.ValueLogSet); err != nil { return nil, err } - if err := db.collectLeafRefValueLogLiveBytes(ctx, snap.idx.pager, snap.state.SystemRootPageID, liveByID, &seenGroupedRecords); err != nil { + if err := db.collectLeafRefValueLogLiveBytes(ctx, snap.idx.pager, snap.state.SystemRootPageID, liveByID, &seenGroupedRecords, snap.state.ValueLogSet); err != nil { return nil, err } } @@ -743,7 +747,7 @@ type groupedRecordKey struct { start uint64 } -func (db *DB) collectValueLogLiveBytes(ctx context.Context, it iterator.UnsafeIterator, liveByID map[uint32]int64, seenGroupedRecords *map[groupedRecordKey]struct{}) error { +func (db *DB) collectValueLogLiveBytes(ctx context.Context, it iterator.UnsafeIterator, liveByID map[uint32]int64, seenGroupedRecords *map[groupedRecordKey]struct{}, set *valuelog.Set) error { for it.Valid() { if err := ctx.Err(); err != nil { return err @@ -776,7 +780,7 @@ func (db *DB) collectValueLogLiveBytes(ctx context.Context, it iterator.UnsafeIt seen[k] = struct{}{} } - recordLen, err := db.valueLogRecordLengthForRewrite(ptr) + recordLen, err := 
db.valueLogRecordLengthForRewriteInSet(ptr, set) if err != nil { return err } @@ -786,7 +790,7 @@ func (db *DB) collectValueLogLiveBytes(ctx context.Context, it iterator.UnsafeIt return it.Error() } -func (db *DB) collectLeafRefValueLogLiveBytes(ctx context.Context, p *pager.Pager, rootID uint64, liveByID map[uint32]int64, seenGroupedRecords *map[groupedRecordKey]struct{}) error { +func (db *DB) collectLeafRefValueLogLiveBytes(ctx context.Context, p *pager.Pager, rootID uint64, liveByID map[uint32]int64, seenGroupedRecords *map[groupedRecordKey]struct{}, set *valuelog.Set) error { if ctx == nil { ctx = context.Background() } @@ -794,7 +798,7 @@ func (db *DB) collectLeafRefValueLogLiveBytes(ctx context.Context, p *pager.Page return nil } if ptr, ok := page.DecodeLeafRef(rootID); ok { - return db.collectLeafRefPtrLiveBytes(ptr, liveByID, seenGroupedRecords) + return db.collectLeafRefPtrLiveBytes(ptr, liveByID, seenGroupedRecords, set) } stack := make([]uint64, 0, 128) stack = append(stack, rootID) @@ -835,7 +839,7 @@ func (db *DB) collectLeafRefValueLogLiveBytes(ctx context.Context, p *pager.Page return err } if ptr, ok := page.DecodeLeafRef(childID); ok { - if err := db.collectLeafRefPtrLiveBytes(ptr, liveByID, seenGroupedRecords); err != nil { + if err := db.collectLeafRefPtrLiveBytes(ptr, liveByID, seenGroupedRecords, set); err != nil { return err } continue @@ -852,7 +856,7 @@ func (db *DB) collectLeafRefValueLogLiveBytes(ctx context.Context, p *pager.Page return nil } -func (db *DB) collectLeafRefPtrLiveBytes(ptr page.ValuePtr, liveByID map[uint32]int64, seenGroupedRecords *map[groupedRecordKey]struct{}) error { +func (db *DB) collectLeafRefPtrLiveBytes(ptr page.ValuePtr, liveByID map[uint32]int64, seenGroupedRecords *map[groupedRecordKey]struct{}, set *valuelog.Set) error { if liveByID == nil { return nil } @@ -881,7 +885,7 @@ func (db *DB) collectLeafRefPtrLiveBytes(ptr page.ValuePtr, liveByID map[uint32] seen[k] = struct{}{} } - recordLen, err := 
db.valueLogRecordLengthForRewrite(ptr) + recordLen, err := db.valueLogRecordLengthForRewriteInSet(ptr, set) if err != nil { return err } @@ -906,6 +910,10 @@ func readValueLogRecordLengthFromHeader(r io.ReaderAt, start int64) (uint32, err } func (db *DB) valueLogRecordLengthForRewrite(ptr page.ValuePtr) (uint32, error) { + return db.valueLogRecordLengthForRewriteInSet(ptr, nil) +} + +func (db *DB) valueLogRecordLengthForRewriteInSet(ptr page.ValuePtr, set *valuelog.Set) (uint32, error) { hint := page.ValuePtrRecordLength(ptr) if !valueLogRecordLengthNeedsHeader(ptr, hint) { return hint, nil @@ -913,24 +921,31 @@ func (db *DB) valueLogRecordLengthForRewrite(ptr page.ValuePtr) (uint32, error) if ptr.Offset < 4 { return 0, fmt.Errorf("vlog-rewrite: invalid pointer offset %d", ptr.Offset) } + if set != nil { + f := set.Files[ptr.FileID] + if f != nil && f.File != nil { + start := int64(ptr.Offset - 4) + return readValueLogRecordLengthFromHeader(f.File, start) + } + } if db == nil || db.valueLogManager == nil { return 0, fmt.Errorf("vlog-rewrite: value-log manager unavailable") } - set := db.valueLogManager.CurrentSetNoRefresh() - if set == nil || set.Files[ptr.FileID] == nil { - if set != nil { - _ = db.valueLogManager.Release(set) + currentSet := db.valueLogManager.CurrentSetNoRefresh() + if currentSet == nil || currentSet.Files[ptr.FileID] == nil { + if currentSet != nil { + _ = db.valueLogManager.Release(currentSet) } if err := db.valueLogManager.Refresh(); err != nil { return 0, err } - set = db.valueLogManager.CurrentSetNoRefresh() + currentSet = db.valueLogManager.CurrentSetNoRefresh() } - if set == nil { + if currentSet == nil { return 0, fmt.Errorf("vlog-rewrite: value-log set unavailable") } - defer func() { _ = db.valueLogManager.Release(set) }() - f := set.Files[ptr.FileID] + defer func() { _ = db.valueLogManager.Release(currentSet) }() + f := currentSet.Files[ptr.FileID] if f == nil || f.File == nil { return 0, fmt.Errorf("vlog-rewrite: missing segment for 
pointer %s", formatValueLogPtr(ptr)) } @@ -1208,16 +1223,40 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl return stats, nil } - segments, err := listWALSegments(db.dir) - if err != nil { - return stats, err + nextRID := uint64(0) + var ( + segments []logSegment + lane uint32 + startSeq uint32 + needSegScan = true + ) + if opts.ReserveRIDs != nil && db.valueLogManager != nil { + if hintLane, hintSeq, ok := db.valueLogManager.RewriteLaneHint(); ok { + probePath := filepath.Join(db.dir, "wal", fmt.Sprintf("value-l%d-%06d.log", hintLane, hintSeq+1)) + if _, statErr := os.Stat(probePath); statErr == nil { + needSegScan = true + } else if os.IsNotExist(statErr) { + lane, startSeq = hintLane, hintSeq + needSegScan = false + } else { + return stats, statErr + } + } } - nextRID, err := nextRewriteRIDStart(segments) - if err != nil { - return stats, err + if needSegScan { + segments, err = rewriteWALSegmentsLister(db.dir) + if err != nil { + return stats, err + } + lane, startSeq = chooseRewriteLane(segments) + } + if opts.ReserveRIDs == nil { + nextRID, err = rewriteRIDStartScanner(segments) + if err != nil { + return stats, err + } } ridAlloc := newRewriteRIDAllocator(nextRID, opts.ReserveRIDs) - lane, startSeq := chooseRewriteLane(segments) maxBytes := opts.MaxSegmentBytes if maxBytes <= 0 { maxBytes = defaultValueLogRewriteSegmentBytes @@ -1237,9 +1276,14 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl batchSize := normalizeValueLogRewriteBatchSize(opts.BatchSize) swaps := make([]rewriteSwap, 0, batchSize) + batchCreatedIDs := make([]uint32, 0, 4) localityPolicy := normalizeValueLogRewriteLocalityPolicy(opts.LocalityPolicy) candidates := make([]rewriteCandidate, 0, batchSize) - var rewriteReadScratch []byte + candidateKeyArena := make([]byte, 0, 16<<10) + // Seed decode scratch so ReadUnsafeTo can immediately reuse caller-owned + // storage for grouped compressed reads instead of allocating 
per-record. + const rewriteReadScratchInitCap = 1024 + rewriteReadScratch := make([]byte, 0, rewriteReadScratchInitCap) var canceledErr error flushBatch := func() error { @@ -1248,11 +1292,15 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl } orderRewriteCandidates(candidates, localityPolicy) swaps = swaps[:0] + batchCreatedIDs = batchCreatedIDs[:0] startRID, err := ridAlloc.Reserve(len(candidates)) if err != nil { return err } for _, candidate := range candidates { + if rewriteReadScratch == nil { + rewriteReadScratch = make([]byte, 0, rewriteReadScratchInitCap) + } val, usedScratch, err := db.valueLogManager.ReadUnsafeTo(candidate.oldPtr, rewriteReadScratch) if err != nil { return err @@ -1274,6 +1322,16 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl stats.RecordsCopied++ stats.ValueRecordsCopied++ stats.ValueBytesCopied += int64(len(val)) + seenID := false + for _, id := range batchCreatedIDs { + if id == newPtr.FileID { + seenID = true + break + } + } + if !seenID { + batchCreatedIDs = append(batchCreatedIDs, newPtr.FileID) + } swaps = append(swaps, rewriteSwap{ key: candidate.key, oldPtr: candidate.oldPtr, @@ -1289,10 +1347,26 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl return err } } + // Register rewrite-created segments before publishing pointer swaps so + // finalizeCommit can stay on CurrentSetNoRefresh and avoid full scans. 
+ for _, id := range batchCreatedIDs { + path := db.valueLogManager.SegmentPath(id) + if err := db.valueLogManager.RegisterSegment(path, id); err != nil { + return err + } + if err := db.valueLogManager.PromoteCurrentWritable(id); err != nil { + return err + } + } if err := db.applyRewriteSwapBatch(swaps, opts.SyncEachBatch); err != nil { return err } candidates = candidates[:0] + if cap(candidateKeyArena) > rewriteKeyArenaMaxCap { + candidateKeyArena = nil + } else { + candidateKeyArena = candidateKeyArena[:0] + } return nil } @@ -1316,7 +1390,10 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl continue } } - key := append([]byte(nil), it.UnsafeKey()...) + unsafeKey := it.UnsafeKey() + keyStart := len(candidateKeyArena) + candidateKeyArena = append(candidateKeyArena, unsafeKey...) + key := candidateKeyArena[keyStart:len(candidateKeyArena):len(candidateKeyArena)] candidates = append(candidates, rewriteCandidate{ key: key, oldPtr: oldPtr, @@ -1462,16 +1539,24 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl if err := db.publishValueLogSetNoRefresh(); err != nil { return stats, err } - if err := updateValueLogHealthAfterRewrite(db.dir, oldValueIDs); err != nil { + postSet := db.valueLogManager.CurrentSetNoRefresh() + if postSet != nil { + defer func() { _ = db.valueLogManager.Release(postSet) }() + } + if err := updateValueLogHealthAfterRewrite(db.dir, oldValueIDs, postSet); err != nil { return stats, err } - afterSegs, afterBytes, err := valueLogSegmentStats(db.dir) - if err != nil { - return stats, err + if postSet != nil { + stats.SegmentsAfter, stats.BytesAfter = valueLogSegmentStatsFromSet(postSet) + } else { + afterSegs, afterBytes, err := valueLogSegmentStats(db.dir) + if err != nil { + return stats, err + } + stats.SegmentsAfter = afterSegs + stats.BytesAfter = afterBytes } - stats.SegmentsAfter = afterSegs - stats.BytesAfter = afterBytes if canceledErr != nil { return stats, canceledErr } 
@@ -1482,9 +1567,11 @@ type leafRefRewriteCtx struct { ctx context.Context db *DB - pager *pager.Pager - leafReader tree.SlabReader - alloc interface { + pager *pager.Pager + leafReader tree.SlabReader + leafToer unsafeToReader + leafScratch []byte + alloc interface { Alloc(hint uint64) (uint64, error) } @@ -1501,6 +1588,29 @@ type leafRefRewriteCtx struct { copiedBytes int64 } +func (c *leafRefRewriteCtx) readLeafPage(ptr page.ValuePtr) ([]byte, error) { + if c == nil || c.leafReader == nil { + return nil, fmt.Errorf("vlog-rewrite: value-log snapshot reader unavailable") + } + if c.leafToer != nil { + if cap(c.leafScratch) < page.PageSize { + c.leafScratch = make([]byte, 0, page.PageSize) + } else { + c.leafScratch = c.leafScratch[:0] + } + leafPage, usedScratch, err := c.leafToer.ReadUnsafeTo(ptr, c.leafScratch[:0]) + if err != nil { + return nil, err + } + if usedScratch { + // Keep the caller-provided decode buffer hot across leafref rewrites. + c.leafScratch = leafPage[:0] + } + return leafPage, nil + } + return c.leafReader.ReadUnsafe(ptr) +} + func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { if c == nil { return id, false, errors.New("vlog-rewrite: nil leafref rewrite ctx") @@ -1531,7 +1641,7 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { if c.writer == nil || c.ridAlloc == nil { return id, false, fmt.Errorf("vlog-rewrite: rewrite writer unavailable") } - leafPage, err := c.leafReader.ReadUnsafe(ptr) + leafPage, err := c.readLeafPage(ptr) if err != nil { return id, false, err } @@ -1587,11 +1697,9 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { if count == 0 { return id, false, nil } - childIDs := make([]uint64, int(count)) - keys := make([][]byte, int(count)) - changed := false + var childIDs []uint64 for i := uint16(0); i < count; i++ { - keyView, childID, err := n.GetInternalEntryView(i) + _, childID, err := n.GetInternalEntryView(i) if err != nil { return id, false, 
err } @@ -1599,13 +1707,21 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { if err != nil { return id, false, err } - if childChanged { - changed = true + if childChanged && childIDs == nil { + childIDs = make([]uint64, int(count)) + for j := uint16(0); j < i; j++ { + _, prevChild, err := n.GetInternalEntryView(j) + if err != nil { + return id, false, err + } + childIDs[int(j)] = prevChild + } + } + if childIDs != nil { + childIDs[int(i)] = nextChild } - childIDs[int(i)] = nextChild - keys[int(i)] = append([]byte(nil), keyView...) } - if !changed { + if childIDs == nil { return id, false, nil } if c.alloc == nil { @@ -1628,8 +1744,12 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { } else if ok { b.SetInternalFenceBounds(low, high) } - for i := range childIDs { - if err := b.AddInternalChild(keys[i], childIDs[i]); err != nil { + for i := uint16(0); i < count; i++ { + keyView, _, err := n.GetInternalEntryView(i) + if err != nil { + return id, false, err + } + if err := b.AddInternalChild(keyView, childIDs[int(i)]); err != nil { return id, false, err } } @@ -1721,6 +1841,10 @@ func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, ridAlloc: ridAlloc, sourceIDs: sourceIDs, } + if toer, ok := leafCtx.leafReader.(unsafeToReader); ok { + leafCtx.leafToer = toer + leafCtx.leafScratch = make([]byte, 0, page.PageSize) + } newSysRoot, sysChanged, err := leafCtx.rewriteNode(sysRoot) if err != nil { @@ -1745,8 +1869,23 @@ func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, return 0, 0, err } } + createdIDs, err := writer.createdFileIDs() + if err != nil { + return 0, 0, err + } + if len(createdIDs) > 0 { + // Register rewrite-created segments before commit publication so + // finalizeCommit can publish CurrentSetNoRefresh without forcing a + // filesystem rescan in leafref-heavy rewrite paths. 
+ for _, id := range createdIDs { + path := db.valueLogManager.SegmentPath(id) + if err := db.valueLogManager.RegisterSegment(path, id); err != nil { + return 0, 0, err + } + } + } - if err := db.finalizeCommit(newRoot, newSysRoot, leafCtx.retired, sync, adaptive.Metrics{}, nil, db.indexOuterLeavesInValueLog, nil); err != nil { + if err := db.finalizeCommit(newRoot, newSysRoot, leafCtx.retired, sync, adaptive.Metrics{}, createdIDs, false, nil); err != nil { return 0, 0, err } tracker = nil @@ -1754,12 +1893,13 @@ func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, } func nextRewriteRIDStart(segments []logSegment) (uint64, error) { + const ridScanReaderBufferSize = 64 << 10 maxRID := uint64(0) for _, segment := range segments { if !segment.valueLog { continue } - reader, err := valuelog.NewReader(segment.path, segment.fileID) + reader, err := valuelog.NewReaderWithBufferSize(segment.path, segment.fileID, ridScanReaderBufferSize) if err != nil { if errors.Is(err, os.ErrNotExist) { continue @@ -1837,20 +1977,8 @@ func (db *DB) applyRewriteSwapBatchOptimistic(swaps []rewriteSwap, sync bool) (b defer batch.Release(b) b.Reserve(len(swaps)) - for _, swap := range swaps { - entry, err := tr.GetEntry(swap.key) - if err != nil { - if errors.Is(err, tree.ErrKeyNotFound) { - continue - } - return false, err - } - if entry.Flags&node.FlagPointer == 0 || entry.ValuePtr != swap.oldPtr { - continue - } - if err := b.SetPointerView(swap.key, swap.newPtr); err != nil { - return false, err - } + if err := collectRewriteSwapPointerMatches(tr, b, swaps); err != nil { + return false, err } entries := b.SortedEntries() @@ -1936,20 +2064,8 @@ func (db *DB) applyRewriteSwapBatchSerialized(swaps []rewriteSwap, sync bool) er defer batch.Release(b) b.Reserve(len(swaps)) - for _, swap := range swaps { - entry, err := tr.GetEntry(swap.key) - if err != nil { - if errors.Is(err, tree.ErrKeyNotFound) { - continue - } - return err - } - if entry.Flags&node.FlagPointer == 
0 || entry.ValuePtr != swap.oldPtr { - continue - } - if err := b.SetPointerView(swap.key, swap.newPtr); err != nil { - return err - } + if err := collectRewriteSwapPointerMatches(tr, b, swaps); err != nil { + return err } entries := b.SortedEntries() @@ -1976,6 +2092,42 @@ func (db *DB) applyRewriteSwapBatchSerialized(swaps []rewriteSwap, sync bool) er return nil } +func collectRewriteSwapPointerMatches(tr *tree.Tree, b *batch.Batch, swaps []rewriteSwap) error { + if tr == nil || b == nil || len(swaps) == 0 { + return nil + } + // Sort in-place to avoid per-batch swap-slice copies on rewrite hot paths. + sort.Slice(swaps, func(i, j int) bool { + return bytes.Compare(swaps[i].key, swaps[j].key) < 0 + }) + + it := tr.IteratorWithOptions(swaps[0].key, nil, tree.IteratorOptions{Mode: tree.IteratorModePointerProjection}) + defer func() { _ = it.Close() }() + + for _, swap := range swaps { + for it.Valid() { + curr := it.UnsafeKey() + cmp := bytes.Compare(curr, swap.key) + if cmp < 0 { + it.Next() + continue + } + if cmp > 0 { + break + } + _, ptr, flags := it.UnsafeEntry() + if flags&node.FlagPointer != 0 && ptr == swap.oldPtr { + if err := b.SetPointerView(swap.key, swap.newPtr); err != nil { + return err + } + } + it.Next() + break + } + } + return it.Error() +} + // ValueLogRewriteOffline rewrites value-log pointers into new segments and // swaps index.db to reference the new log. This is an offline operation // (requires exclusive lock and a clean commitlog). 
@@ -2044,7 +2196,7 @@ func ValueLogRewriteOffline(opts Options) (ValueLogRewriteStats, error) { stats.BytesBefore = beforeBytes lane, startSeq := chooseRewriteLane(segments) - nextRID, err := nextRewriteRIDStart(segments) + nextRID, err := rewriteRIDStartScanner(segments) if err != nil { _ = d.Close() return stats, err @@ -2234,7 +2386,7 @@ func ValueLogRewriteOffline(opts Options) (ValueLogRewriteStats, error) { if err := removeOldValueLogSegments(segments); err != nil { return stats, err } - if err := updateValueLogHealthAfterRewrite(opts.Dir, oldValueIDs); err != nil { + if err := updateValueLogHealthAfterRewrite(opts.Dir, oldValueIDs, nil); err != nil { if opts.NotifyError != nil { opts.NotifyError(fmt.Errorf("value-log health update after rewrite: %w", err)) } @@ -2257,6 +2409,10 @@ type rewriteWriter struct { start uint32 maxSize int64 nextRID uint64 + // currentPath/currentFileID cache the active writer segment identity so + // CurrentValueLogSegment can avoid per-call path/fileID recomputation. + currentPath string + currentFileID uint32 // blockCompression enables per-frame block compression for dictID=0 append // paths (used by online rewrite). Offline rewrites use AppendRawRecord and do // not consult this setting. @@ -2397,6 +2553,15 @@ func (w *rewriteWriter) AppendLeafPage(leafPage []byte) (page.ValuePtr, error) { return w.appendValue(rid, leafPage) } +// CurrentValueLogSegment reports the writer's current segment identity. +// This lets commit publication register the segment without directory scans. 
+func (w *rewriteWriter) CurrentValueLogSegment() (string, uint32, bool) { + if w == nil || w.currentPath == "" || w.currentFileID == 0 { + return "", 0, false + } + return w.currentPath, w.currentFileID, true +} + func (w *rewriteWriter) ensureWriter() error { if w.w != nil { return nil @@ -2425,6 +2590,8 @@ func (w *rewriteWriter) rotate() error { writer.SetKeepPolicy(w.keepIoNsPerByte, w.keepEncodeNsRaw, w.keepSafetyMargin) w.w = writer w.seq = nextSeq + w.currentPath = path + w.currentFileID = fileID return nil } if err := w.w.RotateTo(path, fileID); err != nil { @@ -2433,6 +2600,8 @@ func (w *rewriteWriter) rotate() error { w.w.SetBlockCompression(w.blockCodec, w.blockCompression) w.w.SetKeepPolicy(w.keepIoNsPerByte, w.keepEncodeNsRaw, w.keepSafetyMargin) w.seq = nextSeq + w.currentPath = path + w.currentFileID = fileID return nil } @@ -3259,14 +3428,40 @@ func valueLogSegmentStats(dir string) (count int, bytes int64, err error) { if !seg.valueLog { continue } + if seg.size > 0 { + count++ + bytes += seg.size + continue + } + if seg.size == 0 { + // Keep zero-length segments visible in stats (rare but possible for + // newly-created/truncated files). 
+ if _, statErr := os.Stat(seg.path); statErr == nil { + count++ + } + continue + } info, statErr := os.Stat(seg.path) - if statErr != nil { + if statErr == nil { + count++ + bytes += info.Size() + } + } + return count, bytes, nil +} + +func valueLogSegmentStatsFromSet(set *valuelog.Set) (count int, bytes int64) { + if set == nil { + return 0, 0 + } + for _, f := range set.Files { + if f == nil { continue } count++ - bytes += info.Size() + bytes += fileSize(f) } - return count, bytes, nil + return count, bytes } func removeOldValueLogSegments(segments []logSegment) error { diff --git a/TreeDB/db/vlog_rewrite_bench_test.go b/TreeDB/db/vlog_rewrite_bench_test.go new file mode 100644 index 000000000..41c8a2fb3 --- /dev/null +++ b/TreeDB/db/vlog_rewrite_bench_test.go @@ -0,0 +1,424 @@ +package db + +import ( + "bytes" + "context" + "fmt" + "os" + "path/filepath" + "runtime" + "testing" + + "github.com/snissn/gomap/TreeDB/internal/valuelog" + "github.com/snissn/gomap/TreeDB/node" + "github.com/snissn/gomap/TreeDB/page" + "github.com/snissn/gomap/TreeDB/pager" +) + +func BenchmarkValueLogRewriteOnline_ValuePointers(b *testing.B) { + const ( + seg1Records = 2048 + seg2Records = 1024 + ) + + var totalCopied int64 + var totalBytes int64 + var totalRefreshScans uint64 + var totalRewriteAllocs uint64 + + for i := 0; i < b.N; i++ { + b.StopTimer() + db, sourceIDs, cleanup := setupValuePointerRewriteBench(b, seg1Records, seg2Records) + refreshBefore := db.valueLogManager.RefreshScanCount() + var memBefore runtime.MemStats + runtime.ReadMemStats(&memBefore) + b.StartTimer() + + stats, err := db.ValueLogRewriteOnline(context.Background(), ValueLogRewriteOnlineOptions{ + SourceFileIDs: sourceIDs, + BatchSize: 512, + }) + b.StopTimer() + if err != nil { + cleanup() + b.Fatalf("ValueLogRewriteOnline: %v", err) + } + totalCopied += int64(stats.ValueRecordsCopied) + totalBytes += stats.ValueBytesCopied + totalRefreshScans += db.valueLogManager.RefreshScanCount() - refreshBefore + var 
memAfter runtime.MemStats + runtime.ReadMemStats(&memAfter) + if memAfter.Mallocs > memBefore.Mallocs { + totalRewriteAllocs += memAfter.Mallocs - memBefore.Mallocs + } + cleanup() + } + + if b.N > 0 { + b.ReportMetric(float64(totalCopied)/float64(b.N), "value_records/op") + b.ReportMetric(float64(totalBytes)/float64(b.N), "value_bytes/op") + b.ReportMetric(float64(totalRefreshScans)/float64(b.N), "refresh_scans/op") + b.ReportMetric(float64(totalRewriteAllocs)/float64(b.N), "rewrite_allocs/op") + } +} + +func BenchmarkValueLogRewriteOnline_LeafRefs(b *testing.B) { + const keyCount = 1536 + + var totalCopied int64 + var totalBytes int64 + var totalRefreshScans uint64 + var totalRewriteAllocs uint64 + + for i := 0; i < b.N; i++ { + b.StopTimer() + db, sourceIDs, cleanup := setupLeafRefRewriteBench(b, keyCount) + refreshBefore := db.valueLogManager.RefreshScanCount() + var memBefore runtime.MemStats + runtime.ReadMemStats(&memBefore) + b.StartTimer() + + stats, err := db.ValueLogRewriteOnline(context.Background(), ValueLogRewriteOnlineOptions{ + SourceFileIDs: sourceIDs, + BatchSize: 256, + }) + b.StopTimer() + if err != nil { + cleanup() + b.Fatalf("ValueLogRewriteOnline: %v", err) + } + totalCopied += int64(stats.LeafRefRecordsCopied) + totalBytes += stats.LeafRefBytesCopied + totalRefreshScans += db.valueLogManager.RefreshScanCount() - refreshBefore + var memAfter runtime.MemStats + runtime.ReadMemStats(&memAfter) + if memAfter.Mallocs > memBefore.Mallocs { + totalRewriteAllocs += memAfter.Mallocs - memBefore.Mallocs + } + cleanup() + } + + if b.N > 0 { + b.ReportMetric(float64(totalCopied)/float64(b.N), "leafref_records/op") + b.ReportMetric(float64(totalBytes)/float64(b.N), "leafref_bytes/op") + b.ReportMetric(float64(totalRefreshScans)/float64(b.N), "refresh_scans/op") + b.ReportMetric(float64(totalRewriteAllocs)/float64(b.N), "rewrite_allocs/op") + } +} + +func BenchmarkValueLogRewriteOnline_LeafRefs_ReserveRIDs(b *testing.B) { + const keyCount = 1536 + + var 
totalCopied int64 + var totalBytes int64 + var totalRefreshScans uint64 + var totalRewriteAllocs uint64 + + for i := 0; i < b.N; i++ { + b.StopTimer() + db, sourceIDs, cleanup := setupLeafRefRewriteBench(b, keyCount) + refreshBefore := db.valueLogManager.RefreshScanCount() + nextRID := uint64(1 << 42) + var memBefore runtime.MemStats + runtime.ReadMemStats(&memBefore) + b.StartTimer() + + stats, err := db.ValueLogRewriteOnline(context.Background(), ValueLogRewriteOnlineOptions{ + SourceFileIDs: sourceIDs, + BatchSize: 256, + ReserveRIDs: func(count int) (uint64, error) { + if count <= 0 { + return 0, fmt.Errorf("invalid count %d", count) + } + start := nextRID + nextRID += uint64(count) + return start, nil + }, + }) + b.StopTimer() + if err != nil { + cleanup() + b.Fatalf("ValueLogRewriteOnline: %v", err) + } + totalCopied += int64(stats.LeafRefRecordsCopied) + totalBytes += stats.LeafRefBytesCopied + totalRefreshScans += db.valueLogManager.RefreshScanCount() - refreshBefore + var memAfter runtime.MemStats + runtime.ReadMemStats(&memAfter) + if memAfter.Mallocs > memBefore.Mallocs { + totalRewriteAllocs += memAfter.Mallocs - memBefore.Mallocs + } + cleanup() + } + + if b.N > 0 { + b.ReportMetric(float64(totalCopied)/float64(b.N), "leafref_records/op") + b.ReportMetric(float64(totalBytes)/float64(b.N), "leafref_bytes/op") + b.ReportMetric(float64(totalRefreshScans)/float64(b.N), "refresh_scans/op") + b.ReportMetric(float64(totalRewriteAllocs)/float64(b.N), "rewrite_allocs/op") + } +} + +func setupValuePointerRewriteBench(tb testing.TB, seg1Records, seg2Records int) (*DB, []uint32, func()) { + tb.Helper() + dir, err := os.MkdirTemp("", "treedb-vlog-rewrite-value-bench-*") + if err != nil { + tb.Fatalf("MkdirTemp: %v", err) + } + + db, err := Open(Options{ + Dir: dir, + Durability: DurabilityWALOffRelaxed, + DisableBackgroundPrune: true, + LeafPrefixCompression: true, + IndexColumnarLeaves: true, + IndexPackedValuePtr: true, + ValueLog: ValueLogOptions{ + 
ForcePointers: true, + }, + }) + if err != nil { + _ = os.RemoveAll(dir) + tb.Fatalf("Open: %v", err) + } + + ptrs1 := appendPointersInNewSegmentBench(tb, dir, 0, 1, 1_000_000, seg1Records, func(i int) []byte { + return bytes.Repeat([]byte{byte(i % 251)}, 768) + }) + ptrs2 := appendPointersInNewSegmentBench(tb, dir, 0, 2, 2_000_000, seg2Records, func(i int) []byte { + return bytes.Repeat([]byte{byte((i + 7) % 251)}, 768) + }) + + bt, ok := db.NewBatch().(*Batch) + if !ok { + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("NewBatch type assertion failed") + } + // Keep only a subset of segment-1 pointers live so rewrite has deterministic + // stale bytes in the selected source. + for i := range ptrs1 { + if i%4 != 0 { + continue + } + if err := bt.SetPointer([]byte(fmt.Sprintf("s1-live-%06d", i)), ptrs1[i]); err != nil { + _ = bt.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("SetPointer(s1): %v", err) + } + } + for i := range ptrs2 { + if err := bt.SetPointer([]byte(fmt.Sprintf("s2-live-%06d", i)), ptrs2[i]); err != nil { + _ = bt.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("SetPointer(s2): %v", err) + } + } + if err := bt.Write(); err != nil { + _ = bt.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("seed Write: %v", err) + } + _ = bt.Close() + + if err := db.RefreshValueLogSet(); err != nil { + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("RefreshValueLogSet: %v", err) + } + + sourceIDs := []uint32{ptrs1[0].FileID} + cleanup := func() { + _ = db.Close() + _ = os.RemoveAll(dir) + } + return db, sourceIDs, cleanup +} + +func setupLeafRefRewriteBench(tb testing.TB, keyCount int) (*DB, []uint32, func()) { + tb.Helper() + dir, err := os.MkdirTemp("", "treedb-vlog-rewrite-leaf-bench-*") + if err != nil { + tb.Fatalf("MkdirTemp: %v", err) + } + + db, err := Open(Options{ + Dir: dir, + Durability: DurabilityWALOffRelaxed, + DisableBackgroundPrune: true, + IndexOuterLeavesInValueLog: true, + LeafPrefixCompression: 
true, + IndexColumnarLeaves: true, + IndexPackedValuePtr: true, + }) + if err != nil { + _ = os.RemoveAll(dir) + tb.Fatalf("Open: %v", err) + } + + walDir := filepath.Join(dir, "wal") + if err := os.MkdirAll(walDir, 0o755); err != nil { + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("mkdir wal: %v", err) + } + + leafLog := newRewriteWriter(walDir, 0, 0, 16<<10) + leafLog.blockCompression = false + leafLog.blockCodec = valuelog.BlockCodecSnappy + db.SetLeafPageLog(leafLog) + + value := bytes.Repeat([]byte("leafref-bench-value-"), 3) + bt, ok := db.NewBatch().(*Batch) + if !ok { + _ = leafLog.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("NewBatch type assertion failed") + } + for i := 0; i < keyCount; i++ { + key := []byte(fmt.Sprintf("k%06d", i)) + if err := bt.Set(key, value); err != nil { + _ = bt.Close() + _ = leafLog.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("Batch.Set(%q): %v", key, err) + } + } + if err := bt.Write(); err != nil { + _ = bt.Close() + _ = leafLog.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("seed Write: %v", err) + } + _ = bt.Close() + if err := db.RefreshValueLogSet(); err != nil { + _ = leafLog.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("RefreshValueLogSet: %v", err) + } + state := db.State() + if state == nil { + _ = leafLog.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("missing DB state") + } + + counts := make(map[uint32]int, 16) + collectLeafRefFileCountsBench(tb, db.Pager(), state.RootPageID, counts) + collectLeafRefFileCountsBench(tb, db.Pager(), state.SystemRootPageID, counts) + if len(counts) == 0 { + _ = leafLog.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("no leafref source file IDs found") + } + + targetID := uint32(0) + targetCount := -1 + for fileID, n := range counts { + if n > targetCount || (n == targetCount && fileID < targetID) { + targetID = fileID + targetCount = n + } + } + + sourceIDs := []uint32{targetID} + cleanup 
:= func() { + _ = leafLog.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + } + return db, sourceIDs, cleanup +} + +func collectLeafRefFileCountsBench(tb testing.TB, p *pager.Pager, rootID uint64, counts map[uint32]int) { + tb.Helper() + if p == nil || rootID == 0 || counts == nil { + return + } + if ptr, ok := page.DecodeLeafRef(rootID); ok { + counts[ptr.FileID]++ + return + } + + stack := make([]uint64, 0, 128) + stack = append(stack, rootID) + visited := make(map[uint64]struct{}, 256) + + for len(stack) > 0 { + pageID := stack[len(stack)-1] + stack = stack[:len(stack)-1] + if _, ok := visited[pageID]; ok { + continue + } + visited[pageID] = struct{}{} + + if ptr, ok := page.DecodeLeafRef(pageID); ok { + counts[ptr.FileID]++ + continue + } + + data, err := p.Get(pageID) + if err != nil { + tb.Fatalf("pager.Get(%d): %v", pageID, err) + } + n := node.NewNodeView(data) + if !n.VerifyChecksum() { + tb.Fatalf("checksum mismatch on page %d", pageID) + } + + switch n.Type() { + case page.PageTypeLeaf: + // no children + case page.PageTypeInternal: + count := n.Count() + for i := uint16(0); i < count; i++ { + childID, err := n.GetInternalChildID(i) + if err != nil { + tb.Fatalf("GetInternalChildID(%d,%d): %v", pageID, i, err) + } + stack = append(stack, childID) + } + default: + tb.Fatalf("unexpected page type %d at page %d", n.Type(), pageID) + } + } +} + +func appendPointersInNewSegmentBench(tb testing.TB, dir string, lane, seq uint32, ridBase uint64, n int, valueAt func(i int) []byte) []page.ValuePtr { + tb.Helper() + walDir := filepath.Join(dir, "wal") + if err := os.MkdirAll(walDir, 0o755); err != nil { + tb.Fatalf("mkdir wal: %v", err) + } + fileID, err := valuelog.EncodeFileID(lane, seq) + if err != nil { + tb.Fatalf("encode file id lane=%d seq=%d: %v", lane, seq, err) + } + path := filepath.Join(walDir, fmt.Sprintf("value-l%d-%06d.log", lane, seq)) + w, err := valuelog.NewWriter(path, fileID) + if err != nil { + tb.Fatalf("new writer: %v", err) + } + ptrs := 
make([]page.ValuePtr, 0, n) + for i := 0; i < n; i++ { + ptr, err := w.Append(0, nil, ridBase+uint64(i), valueAt(i)) + if err != nil { + _ = w.Close() + tb.Fatalf("append rid=%d: %v", ridBase+uint64(i), err) + } + ptrs = append(ptrs, ptr) + } + if err := w.Close(); err != nil { + tb.Fatalf("close writer: %v", err) + } + return ptrs +} From 7973722c3cccec2a61139e921ef8e4f685253d8d Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 08:23:19 -1000 Subject: [PATCH 56/61] treedb: derive rewrite ref deltas from matched swaps --- TreeDB/db/vlog_rewrite.go | 45 +++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/TreeDB/db/vlog_rewrite.go b/TreeDB/db/vlog_rewrite.go index b63886625..09dd0b049 100644 --- a/TreeDB/db/vlog_rewrite.go +++ b/TreeDB/db/vlog_rewrite.go @@ -1977,7 +1977,8 @@ func (db *DB) applyRewriteSwapBatchOptimistic(swaps []rewriteSwap, sync bool) (b defer batch.Release(b) b.Reserve(len(swaps)) - if err := collectRewriteSwapPointerMatches(tr, b, swaps); err != nil { + rewriteDelta, err := collectRewriteSwapPointerMatches(tr, b, swaps) + if err != nil { return false, err } @@ -1998,13 +1999,9 @@ func (db *DB) applyRewriteSwapBatchOptimistic(swaps []rewriteSwap, sync bool) (b return false, err } entries = b.SortedEntries() - vlogRefDelta, err := db.buildValueLogRefDelta(idx.pager, rootID, baseSeq, entries) - if err != nil { - freeErr := tracker.FreeAll() - if freeErr != nil { - return false, errors.Join(err, freeErr) - } - return false, err + var vlogRefDelta *valueLogRefDelta + if db.valueLogRefTracker != nil && db.valueLogRefTracker.canTrack(baseSeq) && !db.indexOuterLeavesInValueLog { + vlogRefDelta = rewriteDelta } db.commitMu.Lock() @@ -2064,7 +2061,8 @@ func (db *DB) applyRewriteSwapBatchSerialized(swaps []rewriteSwap, sync bool) er defer batch.Release(b) b.Reserve(len(swaps)) - if err := collectRewriteSwapPointerMatches(tr, b, swaps); err != nil { + rewriteDelta, err := 
collectRewriteSwapPointerMatches(tr, b, swaps) + if err != nil { return err } @@ -2079,9 +2077,9 @@ func (db *DB) applyRewriteSwapBatchSerialized(swaps []rewriteSwap, sync bool) er return err } entries = b.SortedEntries() - vlogRefDelta, err := db.buildValueLogRefDelta(idx.pager, rootID, baseSeq, entries) - if err != nil { - return err + var vlogRefDelta *valueLogRefDelta + if db.valueLogRefTracker != nil && db.valueLogRefTracker.canTrack(baseSeq) && !db.indexOuterLeavesInValueLog { + vlogRefDelta = rewriteDelta } if err := db.finalizeCommit(newRoot, sysRoot, retired, sync, metrics, touchedValueLogSegments, db.indexOuterLeavesInValueLog, vlogRefDelta); err != nil { return err @@ -2092,9 +2090,9 @@ func (db *DB) applyRewriteSwapBatchSerialized(swaps []rewriteSwap, sync bool) er return nil } -func collectRewriteSwapPointerMatches(tr *tree.Tree, b *batch.Batch, swaps []rewriteSwap) error { +func collectRewriteSwapPointerMatches(tr *tree.Tree, b *batch.Batch, swaps []rewriteSwap) (*valueLogRefDelta, error) { if tr == nil || b == nil || len(swaps) == 0 { - return nil + return nil, nil } // Sort in-place to avoid per-batch swap-slice copies on rewrite hot paths. 
sort.Slice(swaps, func(i, j int) bool { @@ -2103,6 +2101,7 @@ func collectRewriteSwapPointerMatches(tr *tree.Tree, b *batch.Batch, swaps []rew it := tr.IteratorWithOptions(swaps[0].key, nil, tree.IteratorOptions{Mode: tree.IteratorModePointerProjection}) defer func() { _ = it.Close() }() + var delta *valueLogRefDelta for _, swap := range swaps { for it.Valid() { @@ -2118,14 +2117,28 @@ func collectRewriteSwapPointerMatches(tr *tree.Tree, b *batch.Batch, swaps []rew _, ptr, flags := it.UnsafeEntry() if flags&node.FlagPointer != 0 && ptr == swap.oldPtr { if err := b.SetPointerView(swap.key, swap.newPtr); err != nil { - return err + return nil, err + } + if page.IsValueLogFileID(swap.oldPtr.FileID) || page.IsValueLogFileID(swap.newPtr.FileID) { + if delta == nil { + delta = newValueLogRefDelta() + } + if page.IsValueLogFileID(swap.oldPtr.FileID) { + delta.add(swap.oldPtr.FileID, -1) + } + if page.IsValueLogFileID(swap.newPtr.FileID) { + delta.add(swap.newPtr.FileID, 1) + } } } it.Next() break } } - return it.Error() + if err := it.Error(); err != nil { + return nil, err + } + return delta, nil } // ValueLogRewriteOffline rewrites value-log pointers into new segments and From 0f4039c2ab7548e62f75d2f79989c106b448f0ac Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 08:54:45 -1000 Subject: [PATCH 57/61] bench: add block-normalized metrics to run_celestia AB harness --- docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md | 3 + docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md | 13 +++ scripts/run_celestia_ab.sh | 95 ++++++++++++++++++++ 3 files changed, 111 insertions(+) diff --git a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md index 648120a26..1b4625bc2 100644 --- a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md +++ b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md @@ -94,6 +94,7 @@ Default pair metric focus: - `S_post_wal`: WAL bytes after offline rewrite - `T_total = T_sync + T_rw` - `max_rss_kb` (memory guardrail) +- `blocks_synced` and 
normalized metrics (`*_per_block`) to de-noise moving-target runs Outputs: - `artifacts/celestia_ab//runs.csv` @@ -103,6 +104,8 @@ Outputs: The harness alternates run order per pair (`control->candidate`, then `candidate->control`) and can stop early on clear win/loss signals. +For stable pair scoring, prefer `FREEZE_REMOTE_HEIGHT_AT_START=1` and validate +`delta_blocks_synced` stays near zero across pairs. ## Experimental Knob - `TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1` diff --git a/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md index 7b0836451..d092f4bbd 100644 --- a/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md +++ b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md @@ -67,6 +67,19 @@ Outputs: - `pairs.csv` - per-run `run.json` +Signal hygiene additions in `run_celestia` A/B artifacts: +- `runs.csv` now includes `blocks_synced` plus normalized metrics: + - `s_sync_app_bytes_per_block` + - `s_post_app_bytes_per_block` + - `t_sync_seconds_per_block` + - `t_total_seconds_per_block` +- `pairs.csv` now includes: + - `delta_blocks_synced` + - `delta_s_sync_app_bytes_per_block` + - `delta_t_total_seconds_per_block` +- `summary.md` includes `pairs with block-count drift` so moving-target runs + are visible before making a promote/reject decision. 
+ ## Stage 2: Pprof/Implementation Efficiency Pass Run this stage before full `run_celestia` if fast gate shows: diff --git a/scripts/run_celestia_ab.sh b/scripts/run_celestia_ab.sh index ef8c3f088..4b271caed 100755 --- a/scripts/run_celestia_ab.sh +++ b/scripts/run_celestia_ab.sh @@ -281,6 +281,16 @@ def safe_int(raw: str | None, default: int = 0) -> int: except Exception: return default +def safe_div(num, den): + try: + if den is None or float(den) == 0: + return None + if num is None: + return None + return float(num) / float(den) + except Exception: + return None + def probe_sync_progress(node_log_path: Path | None) -> dict[str, object]: progress = { "node_log_present": False, @@ -347,6 +357,22 @@ if not resolved_invalid_reason and rewrite_attempted == 1 and rewrite_rc != 0: resolved_invalid_reason = "rewrite_failed" valid = resolved_invalid_reason == "" t_total = (t_sync + t_rw) if valid else None +trust_height = safe_int(sync.get("trust_height"), 0) +stop_at_local_height = safe_int(sync.get("stop_at_local_height"), 0) +final_local_height = safe_int(sync.get("final_local_height"), 0) +final_remote_height = safe_int(sync.get("final_remote_height"), 0) +final_remote_height_actual = safe_int(sync.get("final_remote_height_actual"), 0) +freeze_remote_height_at_start = safe_int(sync.get("freeze_remote_height_at_start"), 0) +blocks_synced = 0 +if trust_height > 0 and final_local_height >= trust_height: + blocks_synced = final_local_height - trust_height +remote_minus_stop_height = None +if stop_at_local_height > 0 and final_remote_height > 0: + remote_minus_stop_height = final_remote_height - stop_at_local_height +s_sync_app_bytes_per_block = safe_div(pre_app_bytes, blocks_synced) +s_post_app_bytes_per_block = safe_div(post_app_bytes, blocks_synced) +t_sync_seconds_per_block = safe_div(t_sync, blocks_synced) +t_total_seconds_per_block = safe_div(t_total, blocks_synced) if t_total is not None else None result = { "pair_index": pair_index, @@ -366,6 +392,14 @@ result = 
{ "duration_seconds": t_sync, "max_rss_kb": safe_int(sync.get("max_rss_kb"), 0), "max_hwm_kb": safe_int(sync.get("max_hwm_kb"), 0), + "freeze_remote_height_at_start": freeze_remote_height_at_start, + "trust_height": trust_height, + "stop_at_local_height": stop_at_local_height, + "final_local_height": final_local_height, + "final_remote_height": final_remote_height, + "final_remote_height_actual": final_remote_height_actual, + "blocks_synced": blocks_synced, + "remote_minus_stop_height": remote_minus_stop_height, "end_app_bytes": safe_int(sync.get("end_app_bytes"), pre_app_bytes), "end_data_bytes": safe_int(sync.get("end_data_bytes"), 0), "end_home_bytes": safe_int(sync.get("end_home_bytes"), 0), @@ -389,7 +423,12 @@ result = { "s_sync_wal_bytes": pre_wal_bytes, "s_post_app_bytes": post_app_bytes, "s_post_wal_bytes": post_wal_bytes, + "s_sync_app_bytes_per_block": s_sync_app_bytes_per_block, + "s_post_app_bytes_per_block": s_post_app_bytes_per_block, "max_rss_kb": safe_int(sync.get("max_rss_kb"), 0), + "blocks_synced": blocks_synced, + "t_sync_seconds_per_block": t_sync_seconds_per_block, + "t_total_seconds_per_block": t_total_seconds_per_block, }, "maintenance_summary": maintenance, } @@ -483,6 +522,18 @@ with runs_csv.open("w", newline="", encoding="utf-8") as fh: "s_post_app_bytes", "s_post_wal_bytes", "max_rss_kb", + "blocks_synced", + "trust_height", + "stop_at_local_height", + "final_local_height", + "final_remote_height", + "final_remote_height_actual", + "freeze_remote_height_at_start", + "remote_minus_stop_height", + "s_sync_app_bytes_per_block", + "s_post_app_bytes_per_block", + "t_sync_seconds_per_block", + "t_total_seconds_per_block", "valid", "invalid_reason", "run_exit_code", @@ -499,6 +550,7 @@ with runs_csv.open("w", newline="", encoding="utf-8") as fh: s = r.get("sizes", {}) or {} rw = r.get("rewrite", {}) or {} summary = r.get("maintenance_summary", {}) or {} + sync = r.get("sync", {}) or {} valid = run_is_valid(r) w.writerow([ 
int(r.get("pair_index", 0)), @@ -512,6 +564,18 @@ with runs_csv.open("w", newline="", encoding="utf-8") as fh: s.get("post_app_bytes"), s.get("post_wal_bytes"), m.get("max_rss_kb"), + m.get("blocks_synced"), + sync.get("trust_height"), + sync.get("stop_at_local_height"), + sync.get("final_local_height"), + sync.get("final_remote_height"), + sync.get("final_remote_height_actual"), + sync.get("freeze_remote_height_at_start"), + sync.get("remote_minus_stop_height"), + m.get("s_sync_app_bytes_per_block"), + m.get("s_post_app_bytes_per_block"), + m.get("t_sync_seconds_per_block"), + m.get("t_total_seconds_per_block"), valid, run_invalid_reason(r), run_exit_code(r), @@ -553,6 +617,9 @@ for pair in sorted(by_pair): "delta_t_total_seconds": None, "delta_s_sync_app_bytes": None, "delta_s_post_wal_bytes": None, + "delta_blocks_synced": None, + "delta_s_sync_app_bytes_per_block": None, + "delta_t_total_seconds_per_block": None, "control_valid": ctrl_valid, "candidate_valid": cand_valid, "control_invalid_reason": ctrl_reason, @@ -570,6 +637,12 @@ for pair in sorted(by_pair): base_sync = bm.get("t_sync_seconds") cand_sync_app = cm.get("s_sync_app_bytes") base_sync_app = bm.get("s_sync_app_bytes") + cand_blocks = cm.get("blocks_synced") + base_blocks = bm.get("blocks_synced") + cand_sync_app_per_block = cm.get("s_sync_app_bytes_per_block") + base_sync_app_per_block = bm.get("s_sync_app_bytes_per_block") + cand_total_per_block = cm.get("t_total_seconds_per_block") + base_total_per_block = bm.get("t_total_seconds_per_block") def delta(a, b): if a is None or b is None: @@ -580,6 +653,9 @@ for pair in sorted(by_pair): d_sync = delta(cand_sync, base_sync) d_post_wal = delta(cand_post_wal, base_post_wal) d_sync_app = delta(cand_sync_app, base_sync_app) + d_blocks = delta(cand_blocks, base_blocks) + d_sync_app_per_block = delta(cand_sync_app_per_block, base_sync_app_per_block) + d_total_per_block = delta(cand_total_per_block, base_total_per_block) outcome = "neutral" if d_post_wal is 
not None and d_total is not None: @@ -598,6 +674,9 @@ for pair in sorted(by_pair): "delta_t_total_seconds": d_total, "delta_s_sync_app_bytes": d_sync_app, "delta_s_post_wal_bytes": d_post_wal, + "delta_blocks_synced": d_blocks, + "delta_s_sync_app_bytes_per_block": d_sync_app_per_block, + "delta_t_total_seconds_per_block": d_total_per_block, "control_valid": ctrl_valid, "candidate_valid": cand_valid, "control_invalid_reason": ctrl_reason, @@ -614,6 +693,9 @@ with pairs_csv.open("w", newline="", encoding="utf-8") as fh: "delta_t_total_seconds", "delta_s_sync_app_bytes", "delta_s_post_wal_bytes", + "delta_blocks_synced", + "delta_s_sync_app_bytes_per_block", + "delta_t_total_seconds_per_block", "control_valid", "candidate_valid", "control_invalid_reason", @@ -627,6 +709,9 @@ with pairs_csv.open("w", newline="", encoding="utf-8") as fh: r["delta_t_total_seconds"], r["delta_s_sync_app_bytes"], r["delta_s_post_wal_bytes"], + r["delta_blocks_synced"], + r["delta_s_sync_app_bytes_per_block"], + r["delta_t_total_seconds_per_block"], r["control_valid"], r["candidate_valid"], r["control_invalid_reason"], @@ -637,6 +722,11 @@ with pairs_csv.open("w", newline="", encoding="utf-8") as fh: scored_rows = [row for row in pair_rows if row.get("outcome") != "invalid"] completed_pairs = len(scored_rows) neutral = max(0, completed_pairs - wins - losses) +nonzero_block_drift_pairs = 0 +for row in scored_rows: + d = row.get("delta_blocks_synced") + if d is not None and d != 0: + nonzero_block_drift_pairs += 1 neutral_streak = 0 for row in reversed(scored_rows): if row.get("outcome") == "neutral": @@ -687,6 +777,7 @@ lines.append(f"- observed pairs: `{raw_pairs}`") lines.append(f"- scored pairs: `{completed_pairs}`") lines.append(f"- invalid pairs skipped: `{invalid_pairs}`") lines.append(f"- wins/losses/neutral: `{wins}` / `{losses}` / `{neutral}`") +lines.append(f"- pairs with block-count drift: `{nonzero_block_drift_pairs}`") lines.append(f"- neutral streak (tail): 
`{neutral_streak}`") lines.append(f"- invalid streak (tail): `{invalid_streak}`") lines.append(f"- size tolerance bytes: `{size_tol}`") @@ -713,6 +804,9 @@ if pair_rows: lines.append(f"- delta_t_total_seconds: `{last['delta_t_total_seconds']}`") lines.append(f"- delta_s_sync_app_bytes: `{last['delta_s_sync_app_bytes']}`") lines.append(f"- delta_s_post_wal_bytes: `{last['delta_s_post_wal_bytes']}`") + lines.append(f"- delta_blocks_synced: `{last['delta_blocks_synced']}`") + lines.append(f"- delta_s_sync_app_bytes_per_block: `{last['delta_s_sync_app_bytes_per_block']}`") + lines.append(f"- delta_t_total_seconds_per_block: `{last['delta_t_total_seconds_per_block']}`") summary_md.write_text("\n".join(lines) + "\n", encoding="utf-8") payload = { @@ -722,6 +816,7 @@ payload = { "wins": wins, "losses": losses, "neutral": neutral, + "nonzero_block_drift_pairs": nonzero_block_drift_pairs, "neutral_streak": neutral_streak, "invalid_streak": invalid_streak, "stop": stop, From 45ec9cdb07de21c8e98b8239765ab3dc6e6e9470 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 10:01:17 -1000 Subject: [PATCH 58/61] docs: add zero-local fast-fail guidance for celestia AB --- docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md index 1b4625bc2..93123d95a 100644 --- a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md +++ b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md @@ -86,6 +86,13 @@ CLEAR_LOSS_PAIRS=3 \ ./scripts/run_celestia_ab.sh ``` +Recommended for probe loops (faster fail on low-signal state-sync stalls): +- Set `FREEZE_REMOTE_HEIGHT_AT_START=1` in both env files so pair targets are stable. +- Set `ZERO_LOCAL_FAIL_SECONDS=` (for example `120` to `300`) to abort runs that + stay at `local=0` too long even if restore I/O is active. +- Keep `NO_PROGRESS_FAIL_SECONDS`/`NO_PROGRESS_HARD_FAIL_SECONDS` as a secondary + backstop for non-zero-local stalls. 
+ Default pair metric focus: - `T_sync`: sync duration (seconds) - `S_sync_app`: app dir bytes at sync end From 1dfa257361a26220286df480253b128d3ffb2cd7 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 11:58:59 -1000 Subject: [PATCH 59/61] db: refresh value-log set for outer-leaf commit publication --- TreeDB/db/db.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/TreeDB/db/db.go b/TreeDB/db/db.go index d7b2f7c05..a9ed96475 100644 --- a/TreeDB/db/db.go +++ b/TreeDB/db/db.go @@ -1587,8 +1587,8 @@ func (db *DB) finalizeCommitLocked(newRootID uint64, sysRootID uint64, retired [ post.oldState = db.state.Load() var valueLogSet *valuelog.Set if db.valueLogManager != nil { - needRefresh := forceValueLogRefresh - if !needRefresh && len(touchedValueLogSegments) > 0 { + needRefresh := false + if len(touchedValueLogSegments) > 0 { for _, id := range touchedValueLogSegments { if !db.valueLogManager.HasSegment(id) { needRefresh = true @@ -1596,6 +1596,13 @@ func (db *DB) finalizeCommitLocked(newRootID uint64, sysRootID uint64, retired [ } } } + if forceValueLogRefresh { + // Outer-leaf commits can rotate multiple value-log segments within a + // single commit. Registering only the current segment can miss + // intermediate referenced segments, so force a full refresh to keep the + // published ValueLogSet complete for snapshot readers. 
+ needRefresh = true + } if needRefresh { if err := db.valueLogManager.Refresh(); err != nil { db.mu.Unlock() From 98c43ef8c94c6d14f8135b21b991c56466367265 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 12:51:38 -1000 Subject: [PATCH 60/61] vlog-rewrite: fast-path explicit sources and stabilize AB runs --- TreeDB/db/vlog_rewrite.go | 39 ++++++++++++++++++++++++++++++++++---- scripts/run_celestia_ab.sh | 18 ++++++++++++++++++ 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/TreeDB/db/vlog_rewrite.go b/TreeDB/db/vlog_rewrite.go index 09dd0b049..bafe6066c 100644 --- a/TreeDB/db/vlog_rewrite.go +++ b/TreeDB/db/vlog_rewrite.go @@ -431,6 +431,32 @@ func hasRewriteSourceSelection(opts ValueLogRewriteOnlineOptions) bool { return false } +func hasOnlyExplicitRewriteSources(opts ValueLogRewriteOnlineOptions) bool { + return len(opts.SourceFileIDs) > 0 && + opts.MaxSourceSegments <= 0 && + opts.MaxSourceBytes <= 0 && + opts.MinSegmentStaleRatio <= 0 && + opts.MinSegmentStaleBytes <= 0 && + opts.MinSegmentAge <= 0 +} + +func selectExplicitRewriteSourceIDs(sourceFileIDs []uint32, files map[uint32]*valuelog.File) map[uint32]struct{} { + if len(sourceFileIDs) == 0 || len(files) == 0 { + return nil + } + selected := make(map[uint32]struct{}, len(sourceFileIDs)) + for _, id := range sourceFileIDs { + if _, ok := files[id]; !ok { + continue + } + selected[id] = struct{}{} + } + if len(selected) == 0 { + return nil + } + return selected +} + func rewritePlanNeedsLiveEstimate(opts ValueLogRewriteOnlineOptions) bool { if !hasRewriteSourceSelection(opts) { return false @@ -491,8 +517,6 @@ func (db *DB) ValueLogRewritePlan(ctx context.Context, opts ValueLogRewriteOnlin plan.BytesTotal += fileSize(f) } - active := currentValueLogIDs(set) - var liveByID map[uint32]int64 var err error // Without selection knobs, the plan is just the global totals and should not @@ -508,7 +532,10 @@ func (db *DB) ValueLogRewritePlan(ctx context.Context, opts ValueLogRewriteOnlin 
sourceIDs := map[uint32]struct{}(nil) var selectionStats rewriteSourceSelectionStats - if hasRewriteSourceSelection(opts) { + if hasOnlyExplicitRewriteSources(opts) { + sourceIDs = selectExplicitRewriteSourceIDs(opts.SourceFileIDs, set.Files) + } else if hasRewriteSourceSelection(opts) { + active := currentValueLogIDs(set) sourceIDs, selectionStats = selectRewriteSourceSegmentsWithStats(opts, set.Files, active, liveByID) } plan.AgeBlockedSegments = selectionStats.ageBlockedSegments @@ -1201,7 +1228,11 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl sourceIDs map[uint32]struct{} restrictSource bool ) - if hasRewriteSourceSelection(opts) { + if hasOnlyExplicitRewriteSources(opts) { + sourceIDs = selectExplicitRewriteSourceIDs(opts.SourceFileIDs, set.Files) + restrictSource = true + stats.SourceSegmentsRequested = len(sourceIDs) + } else if hasRewriteSourceSelection(opts) { active := currentValueLogIDs(set) var liveByID map[uint32]int64 if rewritePlanNeedsLiveEstimate(opts) { diff --git a/scripts/run_celestia_ab.sh b/scripts/run_celestia_ab.sh index 4b271caed..8e1c06cb4 100755 --- a/scripts/run_celestia_ab.sh +++ b/scripts/run_celestia_ab.sh @@ -23,6 +23,11 @@ RUN_TIMEOUT_SECONDS="${RUN_TIMEOUT_SECONDS:-1800}" RUN_MAX_ATTEMPTS_PER_VARIANT="${RUN_MAX_ATTEMPTS_PER_VARIANT:-2}" RUN_RETRY_SLEEP_SECONDS="${RUN_RETRY_SLEEP_SECONDS:-20}" INVALID_PAIR_STREAK_STOP="${INVALID_PAIR_STREAK_STOP:-2}" +AB_DISABLE_HEAVY_DIAGNOSTICS="${AB_DISABLE_HEAVY_DIAGNOSTICS:-1}" +AB_CAPTURE_HEAP_ON_MAX_RSS="${AB_CAPTURE_HEAP_ON_MAX_RSS:-0}" +AB_CAPTURE_PPROF_ON_STUCK="${AB_CAPTURE_PPROF_ON_STUCK:-0}" +AB_CAPTURE_FULL_SMAPS_ON_MAX_RSS="${AB_CAPTURE_FULL_SMAPS_ON_MAX_RSS:-0}" +AB_CAPTURE_DEBUG_VARS_ON_MAX_RSS="${AB_CAPTURE_DEBUG_VARS_ON_MAX_RSS:-0}" TS="$(date +%Y%m%d%H%M%S)" OUT="${OUT_DIR:-$ROOT/artifacts/celestia_ab/$TS}" @@ -79,6 +84,11 @@ run_timeout_seconds=$RUN_TIMEOUT_SECONDS run_max_attempts_per_variant=$RUN_MAX_ATTEMPTS_PER_VARIANT 
run_retry_sleep_seconds=$RUN_RETRY_SLEEP_SECONDS invalid_pair_streak_stop=$INVALID_PAIR_STREAK_STOP +ab_disable_heavy_diagnostics=$AB_DISABLE_HEAVY_DIAGNOSTICS +ab_capture_heap_on_max_rss=$AB_CAPTURE_HEAP_ON_MAX_RSS +ab_capture_pprof_on_stuck=$AB_CAPTURE_PPROF_ON_STUCK +ab_capture_full_smaps_on_max_rss=$AB_CAPTURE_FULL_SMAPS_ON_MAX_RSS +ab_capture_debug_vars_on_max_rss=$AB_CAPTURE_DEBUG_VARS_ON_MAX_RSS META list_run_homes() { @@ -164,6 +174,14 @@ run_variant() { source "$env_file" set +a fi + if [[ "$AB_DISABLE_HEAVY_DIAGNOSTICS" == "1" ]]; then + # A/B runs prioritize stable wall-time+size measurements. Heavy + # diagnostics can dominate runtime and produce invalid comparisons. + export CAPTURE_HEAP_ON_MAX_RSS="${CAPTURE_HEAP_ON_MAX_RSS:-$AB_CAPTURE_HEAP_ON_MAX_RSS}" + export CAPTURE_PPROF_ON_STUCK="${CAPTURE_PPROF_ON_STUCK:-$AB_CAPTURE_PPROF_ON_STUCK}" + export CAPTURE_FULL_SMAPS_ON_MAX_RSS="${CAPTURE_FULL_SMAPS_ON_MAX_RSS:-$AB_CAPTURE_FULL_SMAPS_ON_MAX_RSS}" + export CAPTURE_DEBUG_VARS_ON_MAX_RSS="${CAPTURE_DEBUG_VARS_ON_MAX_RSS:-$AB_CAPTURE_DEBUG_VARS_ON_MAX_RSS}" + fi # Non-login shell avoids user profile side effects (e.g. tty-dependent exports) # that can fail under nohup/background runs. 
if [[ "$RUN_TIMEOUT_SECONDS" -gt 0 ]] && command -v timeout >/dev/null 2>&1; then From 1c5c909d955d66ce0efbd9f9c5bc29fb7e38dab9 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 13:09:55 -1000 Subject: [PATCH 61/61] vlog-rewrite: trim source/zombie alloc churn in online rewrite --- TreeDB/db/vlog_rewrite.go | 144 ++++++++++++++++++++++++++------------ 1 file changed, 99 insertions(+), 45 deletions(-) diff --git a/TreeDB/db/vlog_rewrite.go b/TreeDB/db/vlog_rewrite.go index bafe6066c..24510285b 100644 --- a/TreeDB/db/vlog_rewrite.go +++ b/TreeDB/db/vlog_rewrite.go @@ -457,6 +457,17 @@ func selectExplicitRewriteSourceIDs(sourceFileIDs []uint32, files map[uint32]*va return selected } +func selectSingleExplicitRewriteSourceID(sourceFileIDs []uint32, files map[uint32]*valuelog.File) (uint32, bool) { + if len(sourceFileIDs) != 1 || len(files) == 0 { + return 0, false + } + id := sourceFileIDs[0] + if _, ok := files[id]; !ok { + return 0, false + } + return id, true +} + func rewritePlanNeedsLiveEstimate(opts ValueLogRewriteOnlineOptions) bool { if !hasRewriteSourceSelection(opts) { return false @@ -1225,13 +1236,26 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl stats.BytesBefore += fileSize(set.Files[id]) } var ( - sourceIDs map[uint32]struct{} - restrictSource bool + sourceIDs map[uint32]struct{} + singleSourceID uint32 + restrictSource bool + restrictSingleID bool + sourceSegmentCount int ) if hasOnlyExplicitRewriteSources(opts) { - sourceIDs = selectExplicitRewriteSourceIDs(opts.SourceFileIDs, set.Files) + if id, ok := selectSingleExplicitRewriteSourceID(opts.SourceFileIDs, set.Files); ok { + singleSourceID = id + restrictSingleID = true + } else { + sourceIDs = selectExplicitRewriteSourceIDs(opts.SourceFileIDs, set.Files) + } restrictSource = true - stats.SourceSegmentsRequested = len(sourceIDs) + if restrictSingleID { + sourceSegmentCount = 1 + } else { + sourceSegmentCount = len(sourceIDs) + } + 
stats.SourceSegmentsRequested = sourceSegmentCount } else if hasRewriteSourceSelection(opts) { active := currentValueLogIDs(set) var liveByID map[uint32]int64 @@ -1244,10 +1268,11 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl } sourceIDs, _ = selectRewriteSourceSegmentsWithStats(opts, set.Files, active, liveByID) restrictSource = true - stats.SourceSegmentsRequested = len(sourceIDs) + sourceSegmentCount = len(sourceIDs) + stats.SourceSegmentsRequested = sourceSegmentCount } _ = db.valueLogManager.Release(set) - if restrictSource && len(sourceIDs) == 0 { + if restrictSource && sourceSegmentCount == 0 { // No source segments selected: this rewrite pass is a no-op. stats.SegmentsAfter = stats.SegmentsBefore stats.BytesAfter = stats.BytesBefore @@ -1417,8 +1442,14 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl continue } if restrictSource { - if _, ok := sourceIDs[oldPtr.FileID]; !ok { - continue + if restrictSingleID { + if oldPtr.FileID != singleSourceID { + continue + } + } else { + if _, ok := sourceIDs[oldPtr.FileID]; !ok { + continue + } } } unsafeKey := it.UnsafeKey() @@ -1451,8 +1482,8 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl // by LeafRef pointers even if all key/value pointers are rewritten. Move // referenced leaf pages out of the selected source segments so cleanup can // actually reclaim space. 
- if restrictSource && db.indexOuterLeavesInValueLog && len(sourceIDs) > 0 { - copied, copiedBytes, err := db.rewriteLeafRefsOnline(ctx, writer, ridAlloc, sourceIDs, opts.SyncEachBatch) + if restrictSource && db.indexOuterLeavesInValueLog && sourceSegmentCount > 0 { + copied, copiedBytes, err := db.rewriteLeafRefsOnline(ctx, writer, ridAlloc, sourceIDs, singleSourceID, restrictSingleID, opts.SyncEachBatch) if err != nil { return stats, err } @@ -1492,15 +1523,25 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl if err != nil { return stats, err } - if len(sourceIDs) > 0 { - stillReferenced := 0 - for id := range sourceIDs { - if _, ok := referencedAfter[id]; ok { - stillReferenced++ + if sourceSegmentCount > 0 { + if restrictSingleID { + if _, ok := referencedAfter[singleSourceID]; ok { + stats.SourceSegmentsStillReferenced = 1 + stats.SourceSegmentsUnreferenced = 0 + } else { + stats.SourceSegmentsStillReferenced = 0 + stats.SourceSegmentsUnreferenced = 1 } + } else { + stillReferenced := 0 + for id := range sourceIDs { + if _, ok := referencedAfter[id]; ok { + stillReferenced++ + } + } + stats.SourceSegmentsStillReferenced = stillReferenced + stats.SourceSegmentsUnreferenced = len(sourceIDs) - stillReferenced } - stats.SourceSegmentsStillReferenced = stillReferenced - stats.SourceSegmentsUnreferenced = len(sourceIDs) - stillReferenced } var protectedPaths map[string]struct{} allowActiveSkip := len(opts.ProtectedPaths) > 0 @@ -1538,32 +1579,37 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl } _ = db.valueLogManager.Release(currentSet) } - zombieCandidates := make(map[uint32]struct{}, len(oldValueIDs)+len(newValueIDs)) - for id := range oldValueIDs { - zombieCandidates[id] = struct{}{} - } - for _, id := range newValueIDs { - zombieCandidates[id] = struct{}{} - } - for id := range zombieCandidates { + markZombieCandidate := func(id uint32, existedBefore bool) error { if _, ok := 
referencedAfter[id]; ok { - continue + return nil } if _, ok := protectedIDs[id]; ok { - continue + return nil } // Never mark currently-active pre-existing segments zombie when callers // provide ProtectedPaths (cached-mode maintenance). Concurrent writers may // still be appending records whose pointers are not yet visible in the // backend index. - if allowActiveSkip { + if allowActiveSkip && existedBefore { if _, ok := activeIDs[id]; ok { - if _, existed := oldValueIDs[id]; existed { - continue - } + return nil } } if err := db.valueLogManager.MarkZombie(id); err != nil { + return err + } + return nil + } + for id := range oldValueIDs { + if err := markZombieCandidate(id, true); err != nil { + return stats, err + } + } + for _, id := range newValueIDs { + if _, existed := oldValueIDs[id]; existed { + continue + } + if err := markZombieCandidate(id, false); err != nil { return stats, err } } @@ -1609,7 +1655,9 @@ type leafRefRewriteCtx struct { writer *rewriteWriter ridAlloc *rewriteRIDAllocator - sourceIDs map[uint32]struct{} + sourceIDs map[uint32]struct{} + singleSourceID uint32 + hasSingleID bool leafMap map[uint64]uint64 // old leafref id -> new leafref id internalMap map[uint64]uint64 // old internal page id -> new page id @@ -1661,7 +1709,11 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { return mapped, mapped != id, nil } } - if c.sourceIDs != nil { + if c.hasSingleID { + if ptr.FileID != c.singleSourceID { + return id, false, nil + } + } else if c.sourceIDs != nil { if _, ok := c.sourceIDs[ptr.FileID]; !ok { return id, false, nil } @@ -1804,7 +1856,7 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { } } -func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, ridAlloc *rewriteRIDAllocator, sourceIDs map[uint32]struct{}, sync bool) (copied int, copiedBytes int64, err error) { +func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, ridAlloc 
*rewriteRIDAllocator, sourceIDs map[uint32]struct{}, singleSourceID uint32, hasSingleSourceID bool, sync bool) (copied int, copiedBytes int64, err error) { if db == nil { return 0, 0, fmt.Errorf("missing db") } @@ -1820,9 +1872,9 @@ func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, if writer == nil || ridAlloc == nil { return 0, 0, fmt.Errorf("vlog-rewrite: missing writer/rid state") } - // Treat nil sourceIDs as "all sources" and an empty, non-nil map as "no - // sources". The latter means there is nothing to rewrite. - if sourceIDs != nil && len(sourceIDs) == 0 { + // Treat nil sourceIDs (with no single-source constraint) as "all sources" + // and an empty, non-nil map as "no sources". + if !hasSingleSourceID && sourceIDs != nil && len(sourceIDs) == 0 { return 0, 0, nil } if ctx == nil { @@ -1863,14 +1915,16 @@ func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, }() leafCtx := &leafRefRewriteCtx{ - ctx: ctx, - db: db, - pager: idx.pager, - leafReader: &snap.reader, - alloc: tracker, - writer: writer, - ridAlloc: ridAlloc, - sourceIDs: sourceIDs, + ctx: ctx, + db: db, + pager: idx.pager, + leafReader: &snap.reader, + alloc: tracker, + writer: writer, + ridAlloc: ridAlloc, + sourceIDs: sourceIDs, + singleSourceID: singleSourceID, + hasSingleID: hasSingleSourceID, } if toer, ok := leafCtx.leafReader.(unsafeToReader); ok { leafCtx.leafToer = toer