diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index b38354bec..ec960e5bd 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -1860,6 +1860,15 @@ const ( envDisableVlogGenerationVacuum = "TREEDB_DISABLE_VLOG_GENERATION_VACUUM" envDisableVlogGenerationLoop = "TREEDB_DISABLE_VLOG_GENERATION_LOOP" envDisableVlogGenerationCheckpointKick = "TREEDB_DISABLE_VLOG_GENERATION_CHECKPOINT_KICK" + // Experimental WAL-off checkpoint-kick guard: when enabled, avoid starting + // fresh rewrite planning during hot foreground activity. Queued rewrite debt + // (or deferred maintenance due) remains eligible so resumable progress is not + // starved. + envEnableVlogGenerationCheckpointKickHotDebtOnly = "TREEDB_ENABLE_VLOG_GENERATION_CHECKPOINT_KICK_HOT_DEBT_ONLY" + // Experimental WAL-off override: allow rewrite planning/execution before the + // first explicit checkpoint. Disabled by default because it can add restore + // contention during early state-sync. + envEnableVlogGenerationPreCheckpointRewrite = "TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE" // Diagnostic toggle for WAL-off checkpoint-time sparse-index vacuum. 
envDisableCheckpointAutoVacuum = "TREEDB_DISABLE_CHECKPOINT_AUTO_VACUUM" minMemtablePrealloc = 64 * 1024 @@ -3864,38 +3873,44 @@ func (db *DB) ValueLogRetainedPaths() []string { return db.valueLogRetainedPaths() } -func (db *DB) valueLogProtectedPaths() []string { - retained := db.valueLogRetainedPaths() - inUse := db.valueLogInUsePaths() - if len(retained) == 0 { - return inUse - } - if len(inUse) == 0 { - return retained - } - seen := make(map[string]struct{}, len(retained)+len(inUse)) - paths := make([]string, 0, len(retained)+len(inUse)) - for _, path := range retained { - if path == "" { - continue - } - if _, ok := seen[path]; ok { - continue +func mergeUniqueNonEmptyStrings(pathSets ...[]string) []string { + seen := make(map[string]struct{}) + var out []string + for _, paths := range pathSets { + for _, path := range paths { + if path == "" { + continue + } + if _, ok := seen[path]; ok { + continue + } + seen[path] = struct{}{} + out = append(out, path) } - seen[path] = struct{}{} - paths = append(paths, path) } - for _, path := range inUse { - if path == "" { - continue - } - if _, ok := seen[path]; ok { - continue - } - seen[path] = struct{}{} - paths = append(paths, path) + return out +} + +func (db *DB) valueLogGCProtectedPathSets() (retained []string, inUse []string, merged []string) { + retained = db.valueLogRetainedPaths() + inUse = db.valueLogInUsePaths() + merged = mergeUniqueNonEmptyStrings(retained, inUse) + return retained, inUse, merged +} + +func (db *DB) valueLogProtectedPaths() []string { + _, _, merged := db.valueLogGCProtectedPathSets() + return merged +} + +func (db *DB) valueLogGCOptions(dryRun bool) backenddb.ValueLogGCOptions { + retained, inUse, merged := db.valueLogGCProtectedPathSets() + return backenddb.ValueLogGCOptions{ + DryRun: dryRun, + ProtectedPaths: merged, + ProtectedInUsePaths: inUse, + ProtectedRetainedPaths: retained, } - return paths } // valueLogInUsePaths returns a best-effort snapshot of value-log segment paths @@ 
-4169,6 +4184,9 @@ func (db *DB) allowValueLogPointers() bool { if bytes >= limit { if db.valueLogHardCapWarned.CompareAndSwap(false, true) { db.reportError(fmt.Errorf("cachingdb: retained value-log bytes %d exceed hard cap %d; disabling new value-log pointers", bytes, limit)) + // Hard-cap entry means retained bytes are now constraining placement. + // Request an eager retained prune so lifecycle pins can drain promptly. + db.scheduleRetainedValueLogPruneForce() } return false } @@ -4184,13 +4202,174 @@ type valueLogSetRefresher interface { RefreshValueLogSet() error } -func (db *DB) pruneRetainedValueLogs() { - if !db.valueLogEnabled() { +type retainedValueLogPruneStats struct { + RemovedSegments int + RemovedBytes int64 + InUseSkippedSegments int + InUseSkippedBytes int64 + CandidateSegments int + CandidateBytes int64 + LiveSkippedSegments int + LiveSkippedBytes int64 + ParseSkippedSegments int + ParseSkippedBytes int64 + ZombieMarkedSegments int + ZombieMarkedBytes int64 + ObservedSourceSegments int + ObservedSourceBytes int64 + ObservedSourceCandidateSegments int + ObservedSourceCandidateBytes int64 + ObservedSourceRemovedSegments int + ObservedSourceRemovedBytes int64 + ObservedSourceInUseSkippedSegments int + ObservedSourceInUseSkippedBytes int64 + ObservedSourceLiveSkippedSegments int + ObservedSourceLiveSkippedBytes int64 + ObservedSourceParseSkippedSegments int + ObservedSourceParseSkippedBytes int64 + ObservedSourceZombieMarkedSegments int + ObservedSourceZombieMarkedBytes int64 + AbortedForegroundWrites bool + RetriedWithoutWriteGate bool + RetrySucceeded bool +} + +func (db *DB) observeRetainedValueLogPruneStats(pruneStats retainedValueLogPruneStats) { + if db == nil { return } + db.retainedValueLogPruneLastObservedSourceSegments.Store(int64(pruneStats.ObservedSourceSegments)) + db.retainedValueLogPruneLastObservedSourceBytes.Store(pruneStats.ObservedSourceBytes) + 
db.retainedValueLogPruneLastObservedSourceCandidateSegments.Store(int64(pruneStats.ObservedSourceCandidateSegments)) + db.retainedValueLogPruneLastObservedSourceCandidateBytes.Store(pruneStats.ObservedSourceCandidateBytes) + db.retainedValueLogPruneLastObservedSourceRemovedSegments.Store(int64(pruneStats.ObservedSourceRemovedSegments)) + db.retainedValueLogPruneLastObservedSourceRemovedBytes.Store(pruneStats.ObservedSourceRemovedBytes) + db.retainedValueLogPruneLastObservedSourceInUseSkippedSegments.Store(int64(pruneStats.ObservedSourceInUseSkippedSegments)) + db.retainedValueLogPruneLastObservedSourceInUseSkippedBytes.Store(pruneStats.ObservedSourceInUseSkippedBytes) + db.retainedValueLogPruneLastObservedSourceLiveSkippedSegments.Store(int64(pruneStats.ObservedSourceLiveSkippedSegments)) + db.retainedValueLogPruneLastObservedSourceLiveSkippedBytes.Store(pruneStats.ObservedSourceLiveSkippedBytes) + db.retainedValueLogPruneLastObservedSourceParseSkippedSegments.Store(int64(pruneStats.ObservedSourceParseSkippedSegments)) + db.retainedValueLogPruneLastObservedSourceParseSkippedBytes.Store(pruneStats.ObservedSourceParseSkippedBytes) + db.retainedValueLogPruneLastObservedSourceZombieMarkedSegments.Store(int64(pruneStats.ObservedSourceZombieMarkedSegments)) + db.retainedValueLogPruneLastObservedSourceZombieMarkedBytes.Store(pruneStats.ObservedSourceZombieMarkedBytes) + if pruneStats.ObservedSourceSegments > 0 { + db.retainedValueLogPruneObservedSourceSegmentsTotal.Add(uint64(pruneStats.ObservedSourceSegments)) + } + if pruneStats.ObservedSourceBytes > 0 { + db.retainedValueLogPruneObservedSourceBytesTotal.Add(pruneStats.ObservedSourceBytes) + } + if pruneStats.ObservedSourceCandidateSegments > 0 { + db.retainedValueLogPruneObservedSourceCandidateSegmentsTotal.Add(uint64(pruneStats.ObservedSourceCandidateSegments)) + } + if pruneStats.ObservedSourceCandidateBytes > 0 { + db.retainedValueLogPruneObservedSourceCandidateBytesTotal.Add(pruneStats.ObservedSourceCandidateBytes) 
+ } + if pruneStats.ObservedSourceRemovedSegments > 0 { + db.retainedValueLogPruneObservedSourceRemovedSegmentsTotal.Add(uint64(pruneStats.ObservedSourceRemovedSegments)) + } + if pruneStats.ObservedSourceRemovedBytes > 0 { + db.retainedValueLogPruneObservedSourceRemovedBytesTotal.Add(pruneStats.ObservedSourceRemovedBytes) + } + if pruneStats.ObservedSourceInUseSkippedSegments > 0 { + db.retainedValueLogPruneObservedSourceInUseSkippedSegmentsTotal.Add(uint64(pruneStats.ObservedSourceInUseSkippedSegments)) + } + if pruneStats.ObservedSourceInUseSkippedBytes > 0 { + db.retainedValueLogPruneObservedSourceInUseSkippedBytesTotal.Add(pruneStats.ObservedSourceInUseSkippedBytes) + } + if pruneStats.ObservedSourceLiveSkippedSegments > 0 { + db.retainedValueLogPruneObservedSourceLiveSkippedSegmentsTotal.Add(uint64(pruneStats.ObservedSourceLiveSkippedSegments)) + } + if pruneStats.ObservedSourceLiveSkippedBytes > 0 { + db.retainedValueLogPruneObservedSourceLiveSkippedBytesTotal.Add(pruneStats.ObservedSourceLiveSkippedBytes) + } + if pruneStats.ObservedSourceParseSkippedSegments > 0 { + db.retainedValueLogPruneObservedSourceParseSkippedSegmentsTotal.Add(uint64(pruneStats.ObservedSourceParseSkippedSegments)) + } + if pruneStats.ObservedSourceParseSkippedBytes > 0 { + db.retainedValueLogPruneObservedSourceParseSkippedBytesTotal.Add(pruneStats.ObservedSourceParseSkippedBytes) + } + if pruneStats.ObservedSourceZombieMarkedSegments > 0 { + db.retainedValueLogPruneObservedSourceZombieMarkedSegmentsTotal.Add(uint64(pruneStats.ObservedSourceZombieMarkedSegments)) + } + if pruneStats.ObservedSourceZombieMarkedBytes > 0 { + db.retainedValueLogPruneObservedSourceZombieMarkedBytesTotal.Add(pruneStats.ObservedSourceZombieMarkedBytes) + } + if pruneStats.RetriedWithoutWriteGate { + db.retainedValueLogPruneWriteGateRetries.Add(1) + if pruneStats.RetrySucceeded { + db.retainedValueLogPruneWriteGateRetrySuccesses.Add(1) + } + } + if pruneStats.AbortedForegroundWrites { + 
db.retainedValueLogPruneForegroundAbortRuns.Add(1) + } + if pruneStats.RemovedSegments > 0 { + db.retainedValueLogPruneRemovedSegments.Add(uint64(pruneStats.RemovedSegments)) + } + if pruneStats.RemovedBytes > 0 { + db.retainedValueLogPruneRemovedBytes.Add(uint64(pruneStats.RemovedBytes)) + } + if pruneStats.InUseSkippedSegments > 0 { + db.retainedValueLogPruneInUseSkippedSegments.Add(uint64(pruneStats.InUseSkippedSegments)) + } + if pruneStats.InUseSkippedBytes > 0 { + db.retainedValueLogPruneInUseSkippedBytes.Add(uint64(pruneStats.InUseSkippedBytes)) + } + if pruneStats.CandidateSegments > 0 { + db.retainedValueLogPruneCandidateSegments.Add(uint64(pruneStats.CandidateSegments)) + } + if pruneStats.CandidateBytes > 0 { + db.retainedValueLogPruneCandidateBytes.Add(uint64(pruneStats.CandidateBytes)) + } + if pruneStats.LiveSkippedSegments > 0 { + db.retainedValueLogPruneLiveSkippedSegments.Add(uint64(pruneStats.LiveSkippedSegments)) + } + if pruneStats.LiveSkippedBytes > 0 { + db.retainedValueLogPruneLiveSkippedBytes.Add(uint64(pruneStats.LiveSkippedBytes)) + } + if pruneStats.ParseSkippedSegments > 0 { + db.retainedValueLogPruneParseSkippedSegments.Add(uint64(pruneStats.ParseSkippedSegments)) + } + if pruneStats.ParseSkippedBytes > 0 { + db.retainedValueLogPruneParseSkippedBytes.Add(uint64(pruneStats.ParseSkippedBytes)) + } + if pruneStats.ZombieMarkedSegments > 0 { + db.retainedValueLogPruneZombieMarkedSegments.Add(uint64(pruneStats.ZombieMarkedSegments)) + } + if pruneStats.ZombieMarkedBytes > 0 { + db.retainedValueLogPruneZombieMarkedBytes.Add(uint64(pruneStats.ZombieMarkedBytes)) + } +} + +func (db *DB) valueLogClosedSegmentSize(path string) int64 { + if db == nil || path == "" { + return 0 + } + laneID, _, _, ok := parseLogSeq(filepath.Base(path)) + if !ok || laneID < 0 || laneID >= len(db.lanes) { + return 0 + } + l := &db.lanes[laneID] + l.vlogMu.Lock() + defer l.vlogMu.Unlock() + if l.vlogClosedSizes == nil { + return 0 + } + return l.vlogClosedSizes[path] 
+} + +func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { + return db.pruneRetainedValueLogsWithObserved(force, nil) +} + +func (db *DB) pruneRetainedValueLogsWithObserved(force bool, observedSourceIDs map[uint32]struct{}) retainedValueLogPruneStats { + var out retainedValueLogPruneStats + if !db.valueLogEnabled() { + return out + } paths := db.valueLogRetainedPaths() if len(paths) == 0 { - return + return out } inUse := make(map[string]struct{}) @@ -4198,44 +4377,119 @@ func (db *DB) pruneRetainedValueLogs() { inUse[path] = struct{}{} } - candidatePaths := make([]string, 0, len(paths)) + type pruneCandidate struct { + path string + size int64 + id uint32 + hasID bool + observed bool + } + candidatePaths := make([]pruneCandidate, 0, len(paths)) for _, path := range paths { + size := db.valueLogClosedSegmentSize(path) + candidate := pruneCandidate{path: path, size: size} + if laneID, seq, valueLog, ok := parseLogSeq(filepath.Base(path)); ok && valueLog && laneID >= 0 { + if id, err := valuelog.EncodeFileID(uint32(laneID), uint32(seq)); err == nil { + candidate.id = id + candidate.hasID = true + if _, ok := observedSourceIDs[id]; ok { + candidate.observed = true + out.ObservedSourceSegments++ + if size > 0 { + out.ObservedSourceBytes += size + } + } + } + } if _, ok := inUse[path]; ok { + out.InUseSkippedSegments++ + if size > 0 { + out.InUseSkippedBytes += size + } + if candidate.observed { + out.ObservedSourceInUseSkippedSegments++ + if size > 0 { + out.ObservedSourceInUseSkippedBytes += size + } + } continue } if db.cleanupMissingRetainedValueLog(path) { + if size > 0 { + out.RemovedSegments++ + out.RemovedBytes += size + } + if candidate.observed { + out.ObservedSourceRemovedSegments++ + if size > 0 { + out.ObservedSourceRemovedBytes += size + } + } continue } - candidatePaths = append(candidatePaths, path) + out.CandidateSegments++ + if size > 0 { + out.CandidateBytes += size + } + if candidate.observed { + 
out.ObservedSourceCandidateSegments++ + if size > 0 { + out.ObservedSourceCandidateBytes += size + } + } + candidatePaths = append(candidatePaths, candidate) } if len(candidatePaths) == 0 { - return + return out } live, err := db.collectValueLogLiveIDsUntil(db.lastForegroundWriteUnixNano.Load()) + if err != nil && force && errors.Is(err, errForegroundWritesResumed) { + out.RetriedWithoutWriteGate = true + live, err = db.collectValueLogLiveIDsUntil(0) + if err == nil { + out.RetrySucceeded = true + } + } if err != nil { if errors.Is(err, errForegroundWritesResumed) { - return + out.AbortedForegroundWrites = true + return out } db.reportError(fmt.Errorf("cachingdb: failed to scan value-log pointers: %w", err)) - return + return out } removed := false marked := false - for _, path := range candidatePaths { - laneID, seq, valueLog, ok := parseLogSeq(filepath.Base(path)) - if !ok || !valueLog { - continue - } - if laneID < 0 { - continue - } - id, err := valuelog.EncodeFileID(uint32(laneID), uint32(seq)) - if err != nil { + for _, candidate := range candidatePaths { + path := candidate.path + size := candidate.size + id := candidate.id + if !candidate.hasID { + out.ParseSkippedSegments++ + if size > 0 { + out.ParseSkippedBytes += size + } + if candidate.observed { + out.ObservedSourceParseSkippedSegments++ + if size > 0 { + out.ObservedSourceParseSkippedBytes += size + } + } continue } if _, ok := live[id]; ok { + out.LiveSkippedSegments++ + if size > 0 { + out.LiveSkippedBytes += size + } + if candidate.observed { + out.ObservedSourceLiveSkippedSegments++ + if size > 0 { + out.ObservedSourceLiveSkippedBytes += size + } + } continue } @@ -4246,14 +4500,44 @@ func (db *DB) pruneRetainedValueLogs() { if err := marker.MarkValueLogZombie(id); err != nil { if errors.Is(err, valuelog.ErrFileNotFound) && db.cleanupOrphanedRetainedValueLog(path) { removed = true + if size > 0 { + out.RemovedSegments++ + out.RemovedBytes += size + } + if candidate.observed { + 
out.ObservedSourceRemovedSegments++ + if size > 0 { + out.ObservedSourceRemovedBytes += size + } + } continue } if db.cleanupMissingRetainedValueLog(path) { + if size > 0 { + out.RemovedSegments++ + out.RemovedBytes += size + } + if candidate.observed { + out.ObservedSourceRemovedSegments++ + if size > 0 { + out.ObservedSourceRemovedBytes += size + } + } continue } db.reportError(fmt.Errorf("cachingdb: failed to mark value-log %d zombie: %w", id, err)) continue } + out.ZombieMarkedSegments++ + if size > 0 { + out.ZombieMarkedBytes += size + } + if candidate.observed { + out.ObservedSourceZombieMarkedSegments++ + if size > 0 { + out.ObservedSourceZombieMarkedBytes += size + } + } marked = true } else { db.dropValueLogSegment(path) @@ -4262,6 +4546,16 @@ func (db *DB) pruneRetainedValueLogs() { db.untrackValueLogSegmentLocked(path) db.mu.Unlock() removed = true + if size > 0 { + out.RemovedSegments++ + out.RemovedBytes += size + } + if candidate.observed { + out.ObservedSourceRemovedSegments++ + if size > 0 { + out.ObservedSourceRemovedBytes += size + } + } } db.forgetValueLogRetain(path) } @@ -4276,6 +4570,7 @@ func (db *DB) pruneRetainedValueLogs() { if removed { db.syncDirBestEffort(db.dir) } + return out } func (db *DB) retainedPrunePressureBytes() int64 { @@ -4319,6 +4614,10 @@ func (db *DB) retainedPrunePressureBytes() int64 { } func (db *DB) shouldScheduleRetainedValueLogPrune() bool { + return db.shouldScheduleRetainedValueLogPruneWithForce(false) +} + +func (db *DB) shouldScheduleRetainedValueLogPruneWithForce(force bool) bool { if db == nil || !db.valueLogEnabled() { return false } @@ -4326,22 +4625,286 @@ func (db *DB) shouldScheduleRetainedValueLogPrune() bool { if closed <= 0 { return false } + if force { + return true + } return closed >= db.retainedPrunePressureBytes() } +func (db *DB) waitForRetainedValueLogPruneQuietOrForce(quietWindow time.Duration) bool { + if db == nil { + return false + } + if quietWindow <= 0 { + return 
db.retainedPruneForceRequested.Swap(false) + } + ticker := time.NewTicker(foregroundMaintenancePollInterval()) + defer ticker.Stop() + for { + if db.closing.Load() { + return db.retainedPruneForceRequested.Swap(false) + } + if db.retainedPruneForceRequested.Swap(false) { + return true + } + if db.foregroundActivityQuietFor(time.Now(), quietWindow, vlogForegroundReadQuietWindow) { + return false + } + select { + case <-db.closeCh: + return db.retainedPruneForceRequested.Swap(false) + case <-ticker.C: + } + } +} + +func (db *DB) queueRetainedPruneObservedSourceIDs(ids []uint32) { + if db == nil || len(ids) == 0 { + return + } + db.retainedPruneObservedMu.Lock() + if db.retainedPruneObservedSourceIDs == nil { + db.retainedPruneObservedSourceIDs = make(map[uint32]struct{}, len(ids)) + } + for _, id := range ids { + if id == 0 { + continue + } + db.retainedPruneObservedSourceIDs[id] = struct{}{} + } + db.retainedPruneObservedMu.Unlock() +} + +func (db *DB) takeRetainedPruneObservedSourceIDs() map[uint32]struct{} { + if db == nil { + return nil + } + db.retainedPruneObservedMu.Lock() + if len(db.retainedPruneObservedSourceIDs) == 0 { + db.retainedPruneObservedMu.Unlock() + return nil + } + out := db.retainedPruneObservedSourceIDs + db.retainedPruneObservedSourceIDs = nil + db.retainedPruneObservedMu.Unlock() + return out +} + +func (db *DB) retainedPruneObservedSourcePending() bool { + if db == nil { + return false + } + db.retainedPruneObservedMu.Lock() + pending := len(db.retainedPruneObservedSourceIDs) > 0 + db.retainedPruneObservedMu.Unlock() + return pending +} + +func (db *DB) queueVlogGenerationObservedSourceGCList(ids []uint32) { + if db == nil || len(ids) == 0 { + return + } + nowUnixNano := time.Now().UnixNano() + db.vlogGenerationObservedGCMu.Lock() + if db.vlogGenerationObservedGCSourceIDs == nil { + db.vlogGenerationObservedGCSourceIDs = make(map[uint32]struct{}, len(ids)) + } + if db.vlogGenerationObservedGCFirstQueuedUnixNano == nil { + 
db.vlogGenerationObservedGCFirstQueuedUnixNano = make(map[uint32]int64, len(ids)) + } + added := 0 + for _, id := range ids { + if id == 0 { + continue + } + if _, exists := db.vlogGenerationObservedGCSourceIDs[id]; exists { + continue + } + db.vlogGenerationObservedGCSourceIDs[id] = struct{}{} + if _, exists := db.vlogGenerationObservedGCFirstQueuedUnixNano[id]; !exists { + db.vlogGenerationObservedGCFirstQueuedUnixNano[id] = nowUnixNano + } + added++ + } + db.vlogGenerationObservedGCMu.Unlock() + if added > 0 { + db.vlogGenerationObservedGCQueuedBatches.Add(1) + db.vlogGenerationObservedGCQueuedIDs.Add(uint64(added)) + } +} + +func (db *DB) queueVlogGenerationObservedSourceGCIDs(ids map[uint32]struct{}) { + if db == nil || len(ids) == 0 { + return + } + nowUnixNano := time.Now().UnixNano() + db.vlogGenerationObservedGCMu.Lock() + if db.vlogGenerationObservedGCSourceIDs == nil { + db.vlogGenerationObservedGCSourceIDs = make(map[uint32]struct{}, len(ids)) + } + if db.vlogGenerationObservedGCFirstQueuedUnixNano == nil { + db.vlogGenerationObservedGCFirstQueuedUnixNano = make(map[uint32]int64, len(ids)) + } + added := 0 + for id := range ids { + if id == 0 { + continue + } + if _, exists := db.vlogGenerationObservedGCSourceIDs[id]; exists { + continue + } + db.vlogGenerationObservedGCSourceIDs[id] = struct{}{} + if _, exists := db.vlogGenerationObservedGCFirstQueuedUnixNano[id]; !exists { + db.vlogGenerationObservedGCFirstQueuedUnixNano[id] = nowUnixNano + } + added++ + } + db.vlogGenerationObservedGCMu.Unlock() + if added > 0 { + db.vlogGenerationObservedGCQueuedBatches.Add(1) + db.vlogGenerationObservedGCQueuedIDs.Add(uint64(added)) + } +} + +func (db *DB) takeVlogGenerationObservedSourceGCList() []uint32 { + if db == nil { + return nil + } + db.vlogGenerationObservedGCMu.Lock() + if len(db.vlogGenerationObservedGCSourceIDs) == 0 { + db.vlogGenerationObservedGCMu.Unlock() + return nil + } + out := make([]uint32, 0, len(db.vlogGenerationObservedGCSourceIDs)) + for 
id := range db.vlogGenerationObservedGCSourceIDs { + if id == 0 { + continue + } + out = append(out, id) + } + db.vlogGenerationObservedGCSourceIDs = nil + db.vlogGenerationObservedGCMu.Unlock() + if len(out) > 0 { + db.vlogGenerationObservedGCTakenBatches.Add(1) + db.vlogGenerationObservedGCTakenIDs.Add(uint64(len(out))) + } + return out +} + +func (db *DB) finalizeVlogGenerationObservedSourceGCIDs(ids []uint32, dropped bool) { + if db == nil || len(ids) == 0 { + return + } + nowUnixNano := time.Now().UnixNano() + totalLatencyMS := uint64(0) + maxLatencyMS := uint64(0) + finalized := 0 + seen := make(map[uint32]struct{}, len(ids)) + db.vlogGenerationObservedGCMu.Lock() + for _, id := range ids { + if id == 0 { + continue + } + if _, exists := seen[id]; exists { + continue + } + seen[id] = struct{}{} + finalized++ + delete(db.vlogGenerationObservedGCRetryAttempts, id) + if startUnixNano, exists := db.vlogGenerationObservedGCFirstQueuedUnixNano[id]; exists { + delete(db.vlogGenerationObservedGCFirstQueuedUnixNano, id) + if startUnixNano > 0 && nowUnixNano > startUnixNano { + latencyMS := uint64((nowUnixNano - startUnixNano) / int64(time.Millisecond)) + totalLatencyMS += latencyMS + if latencyMS > maxLatencyMS { + maxLatencyMS = latencyMS + } + } + } + } + db.vlogGenerationObservedGCMu.Unlock() + if finalized == 0 { + return + } + if dropped { + db.vlogGenerationObservedGCLatencyDroppedIDs.Add(uint64(finalized)) + } else { + db.vlogGenerationObservedGCLatencyCompletedIDs.Add(uint64(finalized)) + } + if totalLatencyMS > 0 { + db.vlogGenerationObservedGCLatencyTotalMS.Add(totalLatencyMS) + updateAtomicMaxUint64(&db.vlogGenerationObservedGCLatencyMaxMS, maxLatencyMS) + } +} + +func (db *DB) retryVlogGenerationObservedSourceGCList(ids []uint32) (queuedIDs, droppedIDs int) { + if db == nil || len(ids) == 0 { + return 0, 0 + } + retry := make([]uint32, 0, len(ids)) + dropped := make([]uint32, 0, len(ids)) + seen := make(map[uint32]struct{}, len(ids)) + 
db.vlogGenerationObservedGCMu.Lock() + if db.vlogGenerationObservedGCRetryAttempts == nil { + db.vlogGenerationObservedGCRetryAttempts = make(map[uint32]uint8, len(ids)) + } + for _, id := range ids { + if id == 0 { + continue + } + if _, exists := seen[id]; exists { + continue + } + seen[id] = struct{}{} + attempts := db.vlogGenerationObservedGCRetryAttempts[id] + if attempts >= vlogGenerationObservedGCRetryMaxAttempts { + delete(db.vlogGenerationObservedGCRetryAttempts, id) + dropped = append(dropped, id) + continue + } + db.vlogGenerationObservedGCRetryAttempts[id] = attempts + 1 + retry = append(retry, id) + } + db.vlogGenerationObservedGCMu.Unlock() + if len(retry) > 0 { + db.vlogGenerationObservedGCRetryQueued.Add(1) + db.queueVlogGenerationObservedSourceGCList(retry) + } + if len(dropped) > 0 { + db.vlogGenerationObservedGCRetryDropped.Add(uint64(len(dropped))) + db.finalizeVlogGenerationObservedSourceGCIDs(dropped, true) + } + return len(retry), len(dropped) +} + func (db *DB) scheduleRetainedValueLogPrune() { + db.scheduleRetainedValueLogPruneWithForce(false) +} + +func (db *DB) scheduleRetainedValueLogPruneForce() { + db.scheduleRetainedValueLogPruneWithForce(true) +} + +func (db *DB) scheduleRetainedValueLogPruneWithForce(force bool) { if db == nil || !db.valueLogEnabled() { return } + db.retainedValueLogPruneScheduleRequests.Add(1) + if force { + db.retainedPruneForceRequested.Store(true) + db.retainedValueLogPruneScheduleForcedRequests.Add(1) + } if db.testSkipRetainedPrune { return } db.retainedPruneMu.Lock() if db.closing.Load() { + db.retainedValueLogPruneScheduleSkipClosing.Add(1) db.retainedPruneMu.Unlock() return } if db.retainedPruneDone != nil { + db.retainedValueLogPruneScheduleSkipInFlight.Add(1) db.retainedPruneMu.Unlock() return } @@ -4355,8 +4918,17 @@ func (db *DB) scheduleRetainedValueLogPrune() { db.retainedPruneDone = nil db.retainedPruneMu.Unlock() }() - db.waitForForegroundMaintenanceQuietWindow(retainedPruneQuietWindow) - if 
!db.shouldScheduleRetainedValueLogPrune() { + effectiveForce := force || db.retainedPruneForceRequested.Swap(false) + if !effectiveForce { + effectiveForce = db.waitForRetainedValueLogPruneQuietOrForce(retainedPruneQuietWindow) + } + if !db.shouldScheduleRetainedValueLogPruneWithForce(effectiveForce) { + closed := db.valueLogRetainedClosedBytes.Load() + if closed <= 0 { + db.retainedValueLogPruneScheduleSkipNoClosedBytes.Add(1) + } else if !effectiveForce && closed < db.retainedPrunePressureBytes() { + db.retainedValueLogPruneScheduleSkipBelowPressure.Add(1) + } return } // Retained prune is opportunistic reclaim; do not compete with checkpoint @@ -4371,12 +4943,30 @@ func (db *DB) scheduleRetainedValueLogPrune() { } db.checkpointMu.Unlock() now := time.Now() + minInterval := retainedPruneMinInterval + if effectiveForce && db.retainedPruneObservedSourcePending() { + minInterval = retainedPruneObservedMinInterval + } last := db.retainedPruneLastStartUnixNano.Load() - if last > 0 && now.Sub(time.Unix(0, last)) < retainedPruneMinInterval { + if last > 0 && now.Sub(time.Unix(0, last)) < minInterval { + db.retainedValueLogPruneScheduleSkipMinInterval.Add(1) return } db.retainedPruneLastStartUnixNano.Store(now.UnixNano()) - db.pruneRetainedValueLogs() + db.retainedValueLogPruneRuns.Add(1) + if effectiveForce { + db.retainedValueLogPruneForcedRuns.Add(1) + } + db.retainedValueLogPruneLastUnixNano.Store(now.UnixNano()) + observedSourceIDs := db.takeRetainedPruneObservedSourceIDs() + pruneStats := db.pruneRetainedValueLogsWithObserved(effectiveForce, observedSourceIDs) + db.observeRetainedValueLogPruneStats(pruneStats) + if len(observedSourceIDs) > 0 && (pruneStats.ObservedSourceZombieMarkedSegments > 0 || pruneStats.ObservedSourceRemovedSegments > 0) { + // When a retained prune processes rewrite-observed source segments, + // queue a near-term maintenance pass so GC can re-check reclaim state. 
+ db.queueVlogGenerationObservedSourceGCIDs(observedSourceIDs) + db.vlogGenerationCheckpointKickPending.Store(true) + } }() } @@ -4753,6 +5343,11 @@ type Options struct { ValueLogRewriteTriggerTotalBytes int64 // ValueLogRewriteTriggerChurnPerSec triggers rewrite by churn rate. ValueLogRewriteTriggerChurnPerSec int64 + // ValueLogRewriteMinSegmentAge gates online rewrite to source segments that + // are at least this old. + // + // 0 uses the implementation default. + ValueLogRewriteMinSegmentAge time.Duration // ForceValueLogPointers stores all values out-of-line in the value log. ForceValueLogPointers bool // DisableReadChecksum skips CRC verification on value-log reads. @@ -4999,6 +5594,7 @@ type DB struct { valueLogRewriteTriggerRatioPPM uint32 valueLogRewriteTriggerBytes int64 valueLogRewriteTriggerChurn int64 + valueLogRewriteMinSegmentAge time.Duration valueLogReader *valuelog.Manager valueLogHotLanes []int valueLogWarmLanes []int @@ -5142,97 +5738,287 @@ type DB struct { valueLogMaxSegmentBytes int64 journalCompression bool - disableJournal bool - relaxedSync bool - notifyError func(error) - debugFlushPointers bool - debugFlushTiming bool - debugPtrEligible atomic.Int64 - debugPtrUsed atomic.Int64 - debugPtrNoPtr atomic.Int64 - debugPtrDenied atomic.Int64 - debugPtrDisabled atomic.Int64 - checkpointRuns atomic.Uint64 - checkpointTotalNs atomic.Uint64 - checkpointMaxNs atomic.Uint64 - checkpointNoopSkips atomic.Uint64 - checkpointFlushMuWaitNs atomic.Uint64 - checkpointFlushMuWaitMaxNs atomic.Uint64 - checkpointAutoVacuumRuns atomic.Uint64 - checkpointAutoVacuumLastCheckRun atomic.Uint64 - checkpointAutoVacuumLastPages atomic.Uint64 - checkpointAutoVacuumLastInternalP50 atomic.Uint64 - checkpointAutoVacuumLastInternalAvg atomic.Uint64 - lastForegroundWriteUnixNano atomic.Int64 - lastForegroundReadUnixNano atomic.Int64 - foregroundReadStampCounter atomic.Uint32 - activeForegroundIterators atomic.Int64 - retainedPruneLastStartUnixNano atomic.Int64 - 
retainedPruneMu sync.Mutex - retainedPruneDone chan struct{} - vlogGenerationRemapSuccesses atomic.Uint64 - vlogGenerationRemapFailures atomic.Uint64 - vlogGenerationRewriteBytesIn atomic.Uint64 - vlogGenerationRewriteBytesOut atomic.Uint64 - vlogGenerationRewriteRuns atomic.Uint64 - vlogGenerationRewritePlanRuns atomic.Uint64 - vlogGenerationRewritePlanCanceled atomic.Uint64 - vlogGenerationRewritePlanErrors atomic.Uint64 - vlogGenerationRewritePlanEmpty atomic.Uint64 - vlogGenerationRewritePlanSelected atomic.Uint64 - vlogGenerationRewritePlanCanceledLastNS atomic.Int64 - vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 - vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool - vlogGenerationRewriteIneffectiveLastNS atomic.Int64 - vlogGenerationRewriteIneffectiveRuns atomic.Uint64 - vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 - vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 - vlogGenerationRewriteCanceledRuns atomic.Uint64 - vlogGenerationRewriteCanceledLastNS atomic.Int64 - vlogGenerationRewriteQueuePruneRuns atomic.Uint64 - vlogGenerationRewriteQueuePruneIDs atomic.Uint64 - vlogGenerationGCSegmentsDeleted atomic.Uint64 - vlogGenerationGCBytesDeleted atomic.Uint64 - vlogGenerationGCRuns atomic.Uint64 - vlogGenerationVacuumRuns atomic.Uint64 - vlogGenerationVacuumFailures atomic.Uint64 - vlogGenerationLastVacuumUnixNano atomic.Int64 - vlogGenerationLastRewritePlanUnixNano atomic.Int64 - vlogGenerationLastRewriteUnixNano atomic.Int64 - vlogGenerationLastGCUnixNano atomic.Int64 - vlogGenerationLastCheckpointKickUnixNano atomic.Int64 - vlogGenerationLastGCDryRunUnixNano atomic.Int64 - vlogGenerationLastGCDryRunBytesEligible atomic.Int64 - vlogGenerationLastGCDryRunSegsEligible atomic.Int64 - vlogGenerationChurnBytes atomic.Uint64 - vlogGenerationSchedulerState atomic.Uint32 - vlogGenerationMaintenanceActive atomic.Bool - vlogGenerationLastReason atomic.Uint32 - vlogGenerationCheckpointKickRuns atomic.Uint64 - 
vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 - vlogGenerationCheckpointKickGCRuns atomic.Uint64 - vlogGenerationCheckpointKickPending atomic.Bool - vlogGenerationDeferredMaintenancePending atomic.Bool - vlogGenerationDeferredMaintenanceRunning atomic.Bool - vlogGenerationRewriteStageWakeObservedNS atomic.Int64 - vlogGenerationRewriteQueueMu sync.Mutex - vlogGenerationCheckpointKickActive atomic.Bool - vlogGenerationRewriteQueue []uint32 - vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment - vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty - vlogGenerationRewriteStagePending bool - vlogGenerationRewriteStageObservedUnixNano int64 - vlogGenerationRewriteQueueLoaded bool - vlogGenerationLastChurnBps atomic.Int64 - vlogGenerationLastChurnSampleBytes atomic.Uint64 - vlogGenerationLastChurnSampleNS atomic.Int64 + disableJournal bool + relaxedSync bool + notifyError func(error) + debugFlushPointers bool + debugFlushTiming bool + debugPtrEligible atomic.Int64 + debugPtrUsed atomic.Int64 + debugPtrNoPtr atomic.Int64 + debugPtrDenied atomic.Int64 + debugPtrDisabled atomic.Int64 + checkpointRuns atomic.Uint64 + checkpointTotalNs atomic.Uint64 + checkpointMaxNs atomic.Uint64 + checkpointNoopSkips atomic.Uint64 + checkpointFlushMuWaitNs atomic.Uint64 + checkpointFlushMuWaitMaxNs atomic.Uint64 + checkpointAutoVacuumRuns atomic.Uint64 + checkpointAutoVacuumLastCheckRun atomic.Uint64 + checkpointAutoVacuumLastPages atomic.Uint64 + checkpointAutoVacuumLastInternalP50 atomic.Uint64 + checkpointAutoVacuumLastInternalAvg atomic.Uint64 + lastForegroundWriteUnixNano atomic.Int64 + lastForegroundReadUnixNano atomic.Int64 + foregroundReadStampCounter atomic.Uint32 + activeForegroundIterators atomic.Int64 + retainedPruneLastStartUnixNano atomic.Int64 + retainedValueLogPruneLastUnixNano atomic.Int64 + retainedValueLogPruneRuns atomic.Uint64 + retainedValueLogPruneForcedRuns atomic.Uint64 + retainedValueLogPruneForegroundAbortRuns 
atomic.Uint64 + retainedValueLogPruneRemovedSegments atomic.Uint64 + retainedValueLogPruneRemovedBytes atomic.Uint64 + retainedValueLogPruneInUseSkippedSegments atomic.Uint64 + retainedValueLogPruneInUseSkippedBytes atomic.Uint64 + retainedValueLogPruneCandidateSegments atomic.Uint64 + retainedValueLogPruneCandidateBytes atomic.Uint64 + retainedValueLogPruneLiveSkippedSegments atomic.Uint64 + retainedValueLogPruneLiveSkippedBytes atomic.Uint64 + retainedValueLogPruneParseSkippedSegments atomic.Uint64 + retainedValueLogPruneParseSkippedBytes atomic.Uint64 + retainedValueLogPruneZombieMarkedSegments atomic.Uint64 + retainedValueLogPruneZombieMarkedBytes atomic.Uint64 + retainedValueLogPruneLastObservedSourceSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceCandidateSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceCandidateBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceRemovedSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceRemovedBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceInUseSkippedSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceInUseSkippedBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceLiveSkippedSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceLiveSkippedBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceParseSkippedSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceParseSkippedBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceZombieMarkedSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceZombieMarkedBytes atomic.Int64 + retainedValueLogPruneObservedSourceSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceCandidateSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceCandidateBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceRemovedSegmentsTotal 
atomic.Uint64 + retainedValueLogPruneObservedSourceRemovedBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceInUseSkippedSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceInUseSkippedBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceLiveSkippedSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceLiveSkippedBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceParseSkippedSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceParseSkippedBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceZombieMarkedSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceZombieMarkedBytesTotal atomic.Int64 + retainedValueLogPruneScheduleRequests atomic.Uint64 + retainedValueLogPruneScheduleForcedRequests atomic.Uint64 + retainedValueLogPruneScheduleSkipClosing atomic.Uint64 + retainedValueLogPruneScheduleSkipInFlight atomic.Uint64 + retainedValueLogPruneScheduleSkipNoClosedBytes atomic.Uint64 + retainedValueLogPruneScheduleSkipBelowPressure atomic.Uint64 + retainedValueLogPruneScheduleSkipMinInterval atomic.Uint64 + retainedValueLogPruneWriteGateRetries atomic.Uint64 + retainedValueLogPruneWriteGateRetrySuccesses atomic.Uint64 + retainedPruneForceRequested atomic.Bool + retainedPruneObservedMu sync.Mutex + retainedPruneObservedSourceIDs map[uint32]struct{} + vlogGenerationObservedGCMu sync.Mutex + vlogGenerationObservedGCSourceIDs map[uint32]struct{} + vlogGenerationObservedGCQueuedBatches atomic.Uint64 + vlogGenerationObservedGCQueuedIDs atomic.Uint64 + vlogGenerationObservedGCTakenBatches atomic.Uint64 + vlogGenerationObservedGCTakenIDs atomic.Uint64 + vlogGenerationObservedGCRuns atomic.Uint64 + vlogGenerationObservedGCRetryQueued atomic.Uint64 + vlogGenerationObservedGCRetryDropped atomic.Uint64 + vlogGenerationObservedGCRetryAttempts map[uint32]uint8 + vlogGenerationObservedGCFirstQueuedUnixNano map[uint32]int64 + vlogGenerationObservedGCLatencyCompletedIDs atomic.Uint64 + 
vlogGenerationObservedGCLatencyDroppedIDs atomic.Uint64 + vlogGenerationObservedGCLatencyTotalMS atomic.Uint64 + vlogGenerationObservedGCLatencyMaxMS atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsEligibleTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsDeletedTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsProtectedInUseTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsProtectedRetainedTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsProtectedOverlapTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsProtectedOtherTotal atomic.Uint64 + vlogGenerationObservedGCSourceBytesTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesEligibleTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesDeletedTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesProtectedInUseTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesProtectedRetainedTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesProtectedOverlapTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesProtectedOtherTotal atomic.Int64 + retainedPruneMu sync.Mutex + retainedPruneDone chan struct{} + vlogGenerationRemapSuccesses atomic.Uint64 + vlogGenerationRemapFailures atomic.Uint64 + vlogGenerationRewriteBytesIn atomic.Uint64 + vlogGenerationRewriteBytesOut atomic.Uint64 + vlogGenerationRewriteReclaimedBytes atomic.Uint64 + vlogGenerationRewriteValueRecordsCopied atomic.Uint64 + vlogGenerationRewriteValueBytesCopied atomic.Uint64 + vlogGenerationRewriteLeafRefRecordsCopied atomic.Uint64 + vlogGenerationRewriteLeafRefBytesCopied atomic.Uint64 + vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 + vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 + vlogGenerationRewriteNoReclaimRuns atomic.Uint64 + vlogGenerationRewriteNoReclaimStaleBytes atomic.Uint64 + vlogGenerationRewriteRuns atomic.Uint64 + vlogGenerationRewritePlanRuns atomic.Uint64 + vlogGenerationRewritePlanCanceled 
atomic.Uint64 + vlogGenerationRewritePlanErrors atomic.Uint64 + vlogGenerationRewritePlanEmpty atomic.Uint64 + vlogGenerationRewritePlanEmptyAgeBlocked atomic.Uint64 + vlogGenerationRewritePlanEmptyNoSelection atomic.Uint64 + vlogGenerationRewritePlanSelected atomic.Uint64 + vlogGenerationRewritePlanSelectedSegments atomic.Uint64 + vlogGenerationRewritePlanSelectedBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 + vlogGenerationRewritePlanPenaltyFilterRuns atomic.Uint64 + vlogGenerationRewritePlanPenaltyFilterSegments atomic.Uint64 + vlogGenerationRewritePlanPenaltyFilterToEmpty atomic.Uint64 + vlogGenerationRewritePlanCanceledLastNS atomic.Int64 + vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 + vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool + vlogGenerationRewriteIneffectiveLastNS atomic.Int64 + vlogGenerationRewriteIneffectiveRuns atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 + vlogGenerationRewriteCanceledRuns atomic.Uint64 + vlogGenerationRewriteCanceledFreshPlanRuns atomic.Uint64 + vlogGenerationRewriteCanceledQueuedDebtRuns atomic.Uint64 + vlogGenerationRewriteCanceledLastNS atomic.Int64 + vlogGenerationRewriteDeadlineRuns atomic.Uint64 + vlogGenerationRewriteDeadlineFreshPlanRuns atomic.Uint64 + vlogGenerationRewriteDeadlineQueuedDebtRuns atomic.Uint64 + vlogGenerationRewriteDeadlineLastNS atomic.Int64 + vlogGenerationRewriteQueuePruneRuns atomic.Uint64 + vlogGenerationRewriteQueuePruneIDs atomic.Uint64 + vlogGenerationGCSegmentsDeleted atomic.Uint64 + vlogGenerationGCBytesDeleted atomic.Uint64 + vlogGenerationGCRuns atomic.Uint64 + vlogGenerationVacuumRuns atomic.Uint64 + vlogGenerationVacuumFailures atomic.Uint64 + vlogGenerationVacuumSkippedDisabled atomic.Uint64 + vlogGenerationVacuumSkippedRewriteBytes atomic.Uint64 + vlogGenerationVacuumSkippedCooldown atomic.Uint64 + 
vlogGenerationLastVacuumUnixNano atomic.Int64 + vlogGenerationLastRewritePlanUnixNano atomic.Int64 + vlogGenerationLastRewriteUnixNano atomic.Int64 + vlogGenerationLastGCUnixNano atomic.Int64 + vlogGenerationLastGCNoopUnixNano atomic.Int64 + vlogGenerationLastCheckpointKickUnixNano atomic.Int64 + vlogGenerationLastGCDryRunUnixNano atomic.Int64 + vlogGenerationLastGCDryRunBytesEligible atomic.Int64 + vlogGenerationLastGCDryRunSegsEligible atomic.Int64 + vlogGenerationLastGCBytesReferenced atomic.Int64 + vlogGenerationLastGCSegmentsReferenced atomic.Int64 + vlogGenerationLastGCBytesActive atomic.Int64 + vlogGenerationLastGCSegmentsActive atomic.Int64 + vlogGenerationLastGCBytesProtected atomic.Int64 + vlogGenerationLastGCSegmentsProtected atomic.Int64 + vlogGenerationLastGCBytesProtectedInUse atomic.Int64 + vlogGenerationLastGCSegmentsProtectedInUse atomic.Int64 + vlogGenerationLastGCBytesProtectedRetained atomic.Int64 + vlogGenerationLastGCSegmentsProtectedRetained atomic.Int64 + vlogGenerationLastGCBytesProtectedOverlap atomic.Int64 + vlogGenerationLastGCSegmentsProtectedOverlap atomic.Int64 + vlogGenerationLastGCBytesProtectedOther atomic.Int64 + vlogGenerationLastGCSegmentsProtectedOther atomic.Int64 + vlogGenerationLastGCBytesEligible atomic.Int64 + vlogGenerationLastGCSegmentsEligible atomic.Int64 + vlogGenerationLastGCBytesDeleted atomic.Int64 + vlogGenerationLastGCSegmentsDeleted atomic.Int64 + vlogGenerationLastGCBytesPending atomic.Int64 + vlogGenerationLastGCSegmentsPending atomic.Int64 + vlogGenerationLastGCObservedSourceSegments atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsReferenced atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsActive atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsProtected atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsProtectedInUse atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsProtectedRetained atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsProtectedOverlap atomic.Int64 
+ vlogGenerationLastGCObservedSourceSegmentsProtectedOther atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsEligible atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsDeleted atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsPending atomic.Int64 + vlogGenerationLastGCObservedSourceBytes atomic.Int64 + vlogGenerationLastGCObservedSourceBytesReferenced atomic.Int64 + vlogGenerationLastGCObservedSourceBytesActive atomic.Int64 + vlogGenerationLastGCObservedSourceBytesProtected atomic.Int64 + vlogGenerationLastGCObservedSourceBytesProtectedInUse atomic.Int64 + vlogGenerationLastGCObservedSourceBytesProtectedRetained atomic.Int64 + vlogGenerationLastGCObservedSourceBytesProtectedOverlap atomic.Int64 + vlogGenerationLastGCObservedSourceBytesProtectedOther atomic.Int64 + vlogGenerationLastGCObservedSourceBytesEligible atomic.Int64 + vlogGenerationLastGCObservedSourceBytesDeleted atomic.Int64 + vlogGenerationLastGCObservedSourceBytesPending atomic.Int64 + vlogGenerationChurnBytes atomic.Uint64 + vlogGenerationSchedulerState atomic.Uint32 + vlogGenerationMaintenanceActive atomic.Bool + vlogGenerationMaintenanceAttempts atomic.Uint64 + vlogGenerationMaintenanceAcquired atomic.Uint64 + vlogGenerationMaintenanceCollisions atomic.Uint64 + vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 + vlogGenerationMaintenanceSkipPhase atomic.Uint64 + vlogGenerationMaintenanceSkipStageGate atomic.Uint64 + vlogGenerationMaintenanceSkipStageNotDue atomic.Uint64 + vlogGenerationMaintenanceSkipStageDue atomic.Uint64 + vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 + vlogGenerationMaintenanceSkipPriority atomic.Uint64 + vlogGenerationMaintenanceSkipQuiet atomic.Uint64 + vlogGenerationMaintenanceSkipPreCheckpoint atomic.Uint64 + vlogGenerationMaintenanceSkipCheckpointing atomic.Uint64 + vlogGenerationMaintenancePassNoop atomic.Uint64 + vlogGenerationMaintenancePassWithRewrite atomic.Uint64 + vlogGenerationMaintenancePassWithGC atomic.Uint64 + 
vlogGenerationMaintenancePassTotalNanos atomic.Uint64 + vlogGenerationMaintenancePassMaxNanos atomic.Uint64 + vlogGenerationLastReason atomic.Uint32 + vlogGenerationCheckpointKickRuns atomic.Uint64 + vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 + vlogGenerationCheckpointKickGCRuns atomic.Uint64 + vlogGenerationCheckpointKickSkippedHotNoDebt atomic.Uint64 + vlogGenerationCheckpointKickPending atomic.Bool + vlogGenerationDeferredMaintenancePending atomic.Bool + vlogGenerationDeferredMaintenanceRunning atomic.Bool + vlogGenerationRewriteStageWakeObservedNS atomic.Int64 + vlogGenerationRewriteQueueMu sync.Mutex + vlogGenerationCheckpointKickActive atomic.Bool + vlogGenerationRewriteQueue []uint32 + vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment + vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty + vlogGenerationRewriteStagePending bool + vlogGenerationRewriteStageObservedUnixNano int64 + vlogGenerationRewriteQueueLoaded bool + vlogGenerationLastChurnBps atomic.Int64 + vlogGenerationLastChurnSampleBytes atomic.Uint64 + vlogGenerationLastChurnSampleNS atomic.Int64 // Rewrite budget token bucket (bytes) for online maintenance. This lets us // interpret ValueLogRewriteBudgetBytesPerSec as a true per-second bandwidth // budget while still running maintenance at coarse intervals. 
- vlogGenerationRewriteBudgetLastUnixNano atomic.Int64 - vlogGenerationRewriteBudgetTokensBytes atomic.Int64 - bgErrMu sync.Mutex - bgErr error + vlogGenerationRewriteBudgetLastUnixNano atomic.Int64 + vlogGenerationRewriteBudgetTokensBytes atomic.Int64 + vlogGenerationRewriteBudgetConsumed atomic.Uint64 + vlogGenerationRewritePlanTotalNanos atomic.Uint64 + vlogGenerationRewritePlanMaxNanos atomic.Uint64 + vlogGenerationRewriteExecTotalNanos atomic.Uint64 + vlogGenerationRewriteExecMaxNanos atomic.Uint64 + vlogGenerationRewriteExecSourceSegments atomic.Uint64 + vlogGenerationRewriteSourceSegmentsRequestedTotal atomic.Uint64 + vlogGenerationRewriteSourceSegmentsStillReferencedTotal atomic.Uint64 + vlogGenerationRewriteSourceSegmentsUnreferencedTotal atomic.Uint64 + vlogGenerationRewriteSourceSegmentsRequestedLast atomic.Uint64 + vlogGenerationRewriteSourceSegmentsStillReferencedLast atomic.Uint64 + vlogGenerationRewriteSourceSegmentsUnreferencedLast atomic.Uint64 + vlogGenerationGCExecTotalNanos atomic.Uint64 + vlogGenerationGCExecMaxNanos atomic.Uint64 + vlogGenerationVacuumExecTotalNanos atomic.Uint64 + vlogGenerationVacuumExecMaxNanos atomic.Uint64 + bgErrMu sync.Mutex + bgErr error // Backpressure state queueBacklogBytes atomic.Int64 @@ -5327,6 +6113,7 @@ const ( vlogGenerationGCMinBytes = int64(1 << 20) vlogGenerationRewriteMinInterval = 30 * time.Second vlogGenerationGCMinInterval = 45 * time.Second + vlogGenerationGCNoopMinInterval = 3 * time.Minute vlogGenerationCheckpointKickMinInterval = 5 * time.Second vlogGenerationCheckpointKickRetryWindow = 5 * time.Second vlogGenerationDeferredRetryWindow = 30 * time.Second @@ -5353,6 +6140,13 @@ const ( // Retained-path prune is opportunistic reclaim. Do not restart a full live-ID // scan on every periodic checkpoint during a hot workload. retainedPruneMinInterval = 30 * time.Second + // Rewrite-observed source IDs can quickly re-trigger forced retained-prune + // requests while replay GC is trying to converge. 
Allow a faster cadence for + // that targeted path without dropping the generic min-interval guard. + retainedPruneObservedMinInterval = 3 * time.Second + // Bound observed-source replay retries so a permanently retained-protected ID + // cannot stay queued forever when replay GC cannot make progress. + vlogGenerationObservedGCRetryMaxAttempts = uint8(3) // Coordinate index vacuum with major rewrite windows; do not run on every GC. vlogGenerationVacuumTriggerRewriteBytes = int64(64 << 20) vlogGenerationVacuumMinInterval = 5 * time.Minute @@ -5384,6 +6178,11 @@ const ( // During checkpoint-kick debt drain, allow a bounded multi-segment rewrite // selection so debt can converge faster than one-segment-per-pass. vlogGenerationRewriteDebtDrainMaxSegments = 8 + // Freshly planned rewrites normally execute one segment to limit immediate + // write amplification. In explicit debt-drain mode, allow a small burst once + // the queue is materially large so convergence does not stall. + vlogGenerationRewriteFreshPlanDebtDrainMinSegments = 4 + vlogGenerationRewriteFreshPlanDebtDrainMaxSegments = 4 ) func (db *DB) flushBackendEntriesCap(totalOps int, sync bool) int { @@ -6865,6 +7664,7 @@ func Open(dir string, backend BackendDB, opts Options) (*DB, error) { valueLogRewriteTriggerRatioPPM := opts.ValueLogRewriteTriggerStaleRatioPPM valueLogRewriteTriggerBytes := opts.ValueLogRewriteTriggerTotalBytes valueLogRewriteTriggerChurn := opts.ValueLogRewriteTriggerChurnPerSec + valueLogRewriteMinSegmentAge := opts.ValueLogRewriteMinSegmentAge if valueLogGenerationHotTarget < 0 { return nil, fmt.Errorf("cachingdb: invalid value-log generational hot segment target bytes %d", valueLogGenerationHotTarget) } @@ -6886,6 +7686,9 @@ func Open(dir string, backend BackendDB, opts Options) (*DB, error) { if valueLogRewriteTriggerChurn < 0 { return nil, fmt.Errorf("cachingdb: invalid value-log generational rewrite trigger churn/sec %d", valueLogRewriteTriggerChurn) } + if 
valueLogRewriteMinSegmentAge < 0 { + return nil, fmt.Errorf("cachingdb: invalid value-log generational rewrite min segment age %s", valueLogRewriteMinSegmentAge) + } if valueLogGenerationPolicyUint8 == uint8(backenddb.ValueLogGenerationHotWarmCold) { if valueLogGenerationHotTarget == 0 { valueLogGenerationHotTarget = defaultVlogGenerationHotTargetBytes @@ -6906,6 +7709,9 @@ func Open(dir string, backend BackendDB, opts Options) (*DB, error) { valueLogRewriteTriggerRatioPPM = defaultVlogRewriteTriggerStalePPM } } + if valueLogRewriteMinSegmentAge == 0 { + valueLogRewriteMinSegmentAge = vlogGenerationRewriteMinSegmentAge + } valueLogRawWritevMinAvgBytes := opts.ValueLogRawWritevMinAvgBytes if valueLogRawWritevMinAvgBytes < 0 { valueLogRawWritevMinAvgBytes = 0 @@ -7203,6 +8009,7 @@ func Open(dir string, backend BackendDB, opts Options) (*DB, error) { valueLogRewriteTriggerRatioPPM: valueLogRewriteTriggerRatioPPM, valueLogRewriteTriggerBytes: valueLogRewriteTriggerBytes, valueLogRewriteTriggerChurn: valueLogRewriteTriggerChurn, + valueLogRewriteMinSegmentAge: valueLogRewriteMinSegmentAge, memtableValueLogPointers: true, indexOuterLeavesInValueLog: opts.IndexOuterLeavesInValueLog, valueLogReader: valueLogReader, @@ -7315,6 +8122,9 @@ func Open(dir string, backend BackendDB, opts Options) (*DB, error) { } l.vlogClosedSizes[seg.path] = seg.size l.vlogClosedBytes.Add(seg.size) + if _, retained := db.valueLogRetain[seg.path]; retained { + db.valueLogRetainedClosedBytes.Add(seg.size) + } } else { if seg.path == l.walPath { continue @@ -12167,10 +12977,25 @@ func (db *DB) maybeRunPeriodicVlogGenerationMaintenance(runGC bool) bool { if db == nil { return false } + if db.vlogGenerationMaintenanceActive.Load() { + return false + } if db.suppressBackgroundVlogGenerationForMaintenancePhase() { db.debugVlogMaintf("periodic_skip reason=maintenance_phase phase=%s run_gc=%t", maintenancePhaseString(uint32(db.MaintenancePhase())), runGC) return false } + // Coarse preflight: while 
foreground activity is hot, avoid entering the + // maintenance engine unless a deferred/checkpoint wake is pending. Apply this + // to both rewrite and periodic GC ticks; otherwise runGC ticks can still + // issue expensive full scans every interval during restore-heavy sync phases. + now := time.Now() + quiet := db.foregroundActivityQuietFor(now, vlogGenerationMaintenanceQuietWindow, vlogForegroundReadQuietWindow) + if !quiet && + !db.vlogGenerationCheckpointKickPending.Load() && + !db.vlogGenerationDeferredMaintenancePending.Load() && + !db.vlogGenerationDeferredMaintenanceDue(now) { + return false + } db.maybeRunVlogGenerationMaintenance(runGC) return true } @@ -12254,7 +13079,7 @@ func (db *DB) vlogGenerationRewriteMaxSegmentsForRun(queueLen int, budgetTokens } // Checkpoint-kick retries should keep each debt-drain run small to reduce // write amplification when foreground ingest is still active. - if opts.bypassQuiet && !opts.skipCheckpoint { + if opts.bypassQuiet && !opts.skipCheckpoint && !vlogGenerationIsStageConfirmSource(opts) && !vlogGenerationIsAgeBlockedSource(opts) { return 1 } maxSegments = queueLen @@ -12290,6 +13115,23 @@ func (db *DB) vlogGenerationRewriteMaxSegmentsForRun(queueLen int, budgetTokens return maxSegments } +func (db *DB) vlogGenerationRewriteMaxSegmentsForFreshPlan(queueLen int, budgetTokens int64, opts vlogGenerationMaintenanceOptions) int { + if db == nil || queueLen <= 1 || !opts.rewriteDebtDrain { + return vlogGenerationRewriteResumeMaxSegments + } + if queueLen < vlogGenerationRewriteFreshPlanDebtDrainMinSegments { + return vlogGenerationRewriteResumeMaxSegments + } + maxSegments := db.vlogGenerationRewriteMaxSegmentsForRun(queueLen, budgetTokens, opts) + if maxSegments > vlogGenerationRewriteFreshPlanDebtDrainMaxSegments { + maxSegments = vlogGenerationRewriteFreshPlanDebtDrainMaxSegments + } + if maxSegments < 1 { + maxSegments = 1 + } + return maxSegments +} + const maxPositiveInt64 = int64(^uint64(0) >> 1) func 
addClampInt64(cur, add, limit int64) int64 { @@ -12367,6 +13209,9 @@ func (db *DB) vlogGenerationConsumeRewriteBudgetBytes(n int64) { next = 0 } if db.vlogGenerationRewriteBudgetTokensBytes.CompareAndSwap(cur, next) { + if consumed := cur - next; consumed > 0 { + db.vlogGenerationRewriteBudgetConsumed.Add(uint64(consumed)) + } return } } @@ -12401,6 +13246,112 @@ func sumVlogRewritePlanLiveBytes(segments []backenddb.ValueLogRewritePlanSegment return sum, ok } +func observeDurationNanos(total, max *atomic.Uint64, d time.Duration) { + if total == nil || max == nil || d <= 0 { + return + } + n := uint64(d) + total.Add(n) + updateAtomicMaxUint64(max, n) +} + +func (db *DB) observeVlogGenerationMaintenancePassDuration(d time.Duration) { + if db == nil { + return + } + observeDurationNanos(&db.vlogGenerationMaintenancePassTotalNanos, &db.vlogGenerationMaintenancePassMaxNanos, d) +} + +func (db *DB) observeVlogGenerationRewritePlanDuration(d time.Duration) { + if db == nil { + return + } + observeDurationNanos(&db.vlogGenerationRewritePlanTotalNanos, &db.vlogGenerationRewritePlanMaxNanos, d) +} + +func (db *DB) observeVlogGenerationRewriteExecDuration(d time.Duration) { + if db == nil { + return + } + observeDurationNanos(&db.vlogGenerationRewriteExecTotalNanos, &db.vlogGenerationRewriteExecMaxNanos, d) +} + +func (db *DB) observeVlogGenerationGCExecDuration(d time.Duration) { + if db == nil { + return + } + observeDurationNanos(&db.vlogGenerationGCExecTotalNanos, &db.vlogGenerationGCExecMaxNanos, d) +} + +func (db *DB) observeVlogGenerationGCStats(stats backenddb.ValueLogGCStats) { + if db == nil { + return + } + db.vlogGenerationLastGCBytesReferenced.Store(stats.BytesReferenced) + db.vlogGenerationLastGCSegmentsReferenced.Store(int64(stats.SegmentsReferenced)) + db.vlogGenerationLastGCBytesActive.Store(stats.BytesActive) + db.vlogGenerationLastGCSegmentsActive.Store(int64(stats.SegmentsActive)) + db.vlogGenerationLastGCBytesProtected.Store(stats.BytesProtected) + 
db.vlogGenerationLastGCSegmentsProtected.Store(int64(stats.SegmentsProtected)) + db.vlogGenerationLastGCBytesProtectedInUse.Store(stats.BytesProtectedInUse) + db.vlogGenerationLastGCSegmentsProtectedInUse.Store(int64(stats.SegmentsProtectedInUse)) + db.vlogGenerationLastGCBytesProtectedRetained.Store(stats.BytesProtectedRetained) + db.vlogGenerationLastGCSegmentsProtectedRetained.Store(int64(stats.SegmentsProtectedRetained)) + db.vlogGenerationLastGCBytesProtectedOverlap.Store(stats.BytesProtectedOverlap) + db.vlogGenerationLastGCSegmentsProtectedOverlap.Store(int64(stats.SegmentsProtectedOverlap)) + db.vlogGenerationLastGCBytesProtectedOther.Store(stats.BytesProtectedOther) + db.vlogGenerationLastGCSegmentsProtectedOther.Store(int64(stats.SegmentsProtectedOther)) + db.vlogGenerationLastGCBytesEligible.Store(stats.BytesEligible) + db.vlogGenerationLastGCSegmentsEligible.Store(int64(stats.SegmentsEligible)) + db.vlogGenerationLastGCBytesDeleted.Store(stats.BytesDeleted) + db.vlogGenerationLastGCSegmentsDeleted.Store(int64(stats.SegmentsDeleted)) + db.vlogGenerationLastGCBytesPending.Store(stats.BytesPending) + db.vlogGenerationLastGCSegmentsPending.Store(int64(stats.SegmentsPending)) + db.vlogGenerationLastGCObservedSourceSegments.Store(int64(stats.ObservedSourceSegments)) + db.vlogGenerationLastGCObservedSourceSegmentsReferenced.Store(int64(stats.ObservedSourceSegmentsReferenced)) + db.vlogGenerationLastGCObservedSourceSegmentsActive.Store(int64(stats.ObservedSourceSegmentsActive)) + db.vlogGenerationLastGCObservedSourceSegmentsProtected.Store(int64(stats.ObservedSourceSegmentsProtected)) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedInUse.Store(int64(stats.ObservedSourceSegmentsProtectedInUse)) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedRetained.Store(int64(stats.ObservedSourceSegmentsProtectedRetained)) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedOverlap.Store(int64(stats.ObservedSourceSegmentsProtectedOverlap)) + 
db.vlogGenerationLastGCObservedSourceSegmentsProtectedOther.Store(int64(stats.ObservedSourceSegmentsProtectedOther)) + db.vlogGenerationLastGCObservedSourceSegmentsEligible.Store(int64(stats.ObservedSourceSegmentsEligible)) + db.vlogGenerationLastGCObservedSourceSegmentsDeleted.Store(int64(stats.ObservedSourceSegmentsDeleted)) + db.vlogGenerationLastGCObservedSourceSegmentsPending.Store(int64(stats.ObservedSourceSegmentsPending)) + db.vlogGenerationLastGCObservedSourceBytes.Store(stats.ObservedSourceBytes) + db.vlogGenerationLastGCObservedSourceBytesReferenced.Store(stats.ObservedSourceBytesReferenced) + db.vlogGenerationLastGCObservedSourceBytesActive.Store(stats.ObservedSourceBytesActive) + db.vlogGenerationLastGCObservedSourceBytesProtected.Store(stats.ObservedSourceBytesProtected) + db.vlogGenerationLastGCObservedSourceBytesProtectedInUse.Store(stats.ObservedSourceBytesProtectedInUse) + db.vlogGenerationLastGCObservedSourceBytesProtectedRetained.Store(stats.ObservedSourceBytesProtectedRetained) + db.vlogGenerationLastGCObservedSourceBytesProtectedOverlap.Store(stats.ObservedSourceBytesProtectedOverlap) + db.vlogGenerationLastGCObservedSourceBytesProtectedOther.Store(stats.ObservedSourceBytesProtectedOther) + db.vlogGenerationLastGCObservedSourceBytesEligible.Store(stats.ObservedSourceBytesEligible) + db.vlogGenerationLastGCObservedSourceBytesDeleted.Store(stats.ObservedSourceBytesDeleted) + db.vlogGenerationLastGCObservedSourceBytesPending.Store(stats.ObservedSourceBytesPending) + db.vlogGenerationObservedGCSourceSegmentsTotal.Add(uint64(stats.ObservedSourceSegments)) + db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Add(uint64(stats.ObservedSourceSegmentsEligible)) + db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Add(uint64(stats.ObservedSourceSegmentsDeleted)) + db.vlogGenerationObservedGCSourceSegmentsProtectedInUseTotal.Add(uint64(stats.ObservedSourceSegmentsProtectedInUse)) + 
db.vlogGenerationObservedGCSourceSegmentsProtectedRetainedTotal.Add(uint64(stats.ObservedSourceSegmentsProtectedRetained)) + db.vlogGenerationObservedGCSourceSegmentsProtectedOverlapTotal.Add(uint64(stats.ObservedSourceSegmentsProtectedOverlap)) + db.vlogGenerationObservedGCSourceSegmentsProtectedOtherTotal.Add(uint64(stats.ObservedSourceSegmentsProtectedOther)) + db.vlogGenerationObservedGCSourceBytesTotal.Add(stats.ObservedSourceBytes) + db.vlogGenerationObservedGCSourceBytesEligibleTotal.Add(stats.ObservedSourceBytesEligible) + db.vlogGenerationObservedGCSourceBytesDeletedTotal.Add(stats.ObservedSourceBytesDeleted) + db.vlogGenerationObservedGCSourceBytesProtectedInUseTotal.Add(stats.ObservedSourceBytesProtectedInUse) + db.vlogGenerationObservedGCSourceBytesProtectedRetainedTotal.Add(stats.ObservedSourceBytesProtectedRetained) + db.vlogGenerationObservedGCSourceBytesProtectedOverlapTotal.Add(stats.ObservedSourceBytesProtectedOverlap) + db.vlogGenerationObservedGCSourceBytesProtectedOtherTotal.Add(stats.ObservedSourceBytesProtectedOther) +} + +func (db *DB) observeVlogGenerationVacuumExecDuration(d time.Duration) { + if db == nil { + return + } + observeDurationNanos(&db.vlogGenerationVacuumExecTotalNanos, &db.vlogGenerationVacuumExecMaxNanos, d) +} + func vlogGenerationRewriteLedgerIDs(segments []backenddb.ValueLogRewritePlanSegment) []uint32 { if len(segments) == 0 { return nil @@ -12416,9 +13367,14 @@ func vlogGenerationRewriteLedgerIDs(segments []backenddb.ValueLogRewritePlanSegm } func (db *DB) observeVlogGenerationRewritePlanOutcome(plan backenddb.ValueLogRewritePlan, err error) { + db.observeVlogGenerationRewritePlanOutcomeWithDuration(plan, err, 0) +} + +func (db *DB) observeVlogGenerationRewritePlanOutcomeWithDuration(plan backenddb.ValueLogRewritePlan, err error, dur time.Duration) { if db == nil { return } + db.observeVlogGenerationRewritePlanDuration(dur) db.vlogGenerationRewritePlanRuns.Add(1) if err != nil { if isVlogGenerationPlannerCanceled(err) { 
@@ -12431,9 +13387,74 @@ func (db *DB) observeVlogGenerationRewritePlanOutcome(plan backenddb.ValueLogRew } if len(plan.SourceFileIDs) > 0 || len(plan.SelectedSegments) > 0 || plan.SegmentsSelected > 0 { db.vlogGenerationRewritePlanSelected.Add(1) + selectedSegments := plan.SegmentsSelected + if selectedSegments <= 0 { + switch { + case len(plan.SelectedSegments) > 0: + selectedSegments = len(plan.SelectedSegments) + case len(plan.SourceFileIDs) > 0: + selectedSegments = len(plan.SourceFileIDs) + } + } + if selectedSegments > 0 { + db.vlogGenerationRewritePlanSelectedSegments.Add(uint64(selectedSegments)) + } + selectedTotal := plan.SelectedBytesTotal + selectedLive := plan.SelectedBytesLive + selectedStale := plan.SelectedBytesStale + if len(plan.SelectedSegments) > 0 && (selectedTotal <= 0 || selectedLive <= 0 || selectedStale <= 0) { + fallbackTotal := int64(0) + fallbackLive := int64(0) + fallbackStale := int64(0) + for _, seg := range plan.SelectedSegments { + if seg.BytesTotal > 0 { + fallbackTotal += seg.BytesTotal + } + if seg.BytesLive > 0 { + fallbackLive += seg.BytesLive + } + if seg.BytesStale > 0 { + fallbackStale += seg.BytesStale + } + } + if selectedTotal <= 0 { + selectedTotal = fallbackTotal + } + if selectedLive <= 0 { + selectedLive = fallbackLive + } + if selectedStale <= 0 { + selectedStale = fallbackStale + } + } + if selectedTotal > 0 { + db.vlogGenerationRewritePlanSelectedBytes.Add(uint64(selectedTotal)) + } + if selectedLive > 0 { + db.vlogGenerationRewritePlanSelectedLiveBytes.Add(uint64(selectedLive)) + } + if selectedStale > 0 { + db.vlogGenerationRewritePlanSelectedStaleBytes.Add(uint64(selectedStale)) + } return } db.vlogGenerationRewritePlanEmpty.Add(1) + if plan.AgeBlockedSegments > 0 && plan.AgeBlockedMinRemainingAge > 0 { + db.vlogGenerationRewritePlanEmptyAgeBlocked.Add(1) + } else { + db.vlogGenerationRewritePlanEmptyNoSelection.Add(1) + } +} + +func (db *DB) observeVlogGenerationRewritePlanPenaltyFilter(before, after int) { + 
if db == nil || before <= 0 || after >= before { + return + } + db.vlogGenerationRewritePlanPenaltyFilterRuns.Add(1) + db.vlogGenerationRewritePlanPenaltyFilterSegments.Add(uint64(before - after)) + if after == 0 { + db.vlogGenerationRewritePlanPenaltyFilterToEmpty.Add(1) + } } func isVlogGenerationPlannerCanceled(err error) bool { @@ -12451,14 +13472,32 @@ func (db *DB) vlogGenerationRewritePlanBackoffActive(now time.Time) bool { return now.Sub(time.Unix(0, lastCanceled)) < vlogGenerationRewritePlanCancelBackoff } -func (db *DB) observeVlogGenerationRewriteCanceled() { +func (db *DB) observeVlogGenerationRewriteCanceled(queuedDebt bool) { if db == nil { return } db.vlogGenerationRewriteCanceledRuns.Add(1) + if queuedDebt { + db.vlogGenerationRewriteCanceledQueuedDebtRuns.Add(1) + } else { + db.vlogGenerationRewriteCanceledFreshPlanRuns.Add(1) + } db.vlogGenerationRewriteCanceledLastNS.Store(time.Now().UnixNano()) } +func (db *DB) observeVlogGenerationRewriteDeadline(queuedDebt bool) { + if db == nil { + return + } + db.vlogGenerationRewriteDeadlineRuns.Add(1) + if queuedDebt { + db.vlogGenerationRewriteDeadlineQueuedDebtRuns.Add(1) + } else { + db.vlogGenerationRewriteDeadlineFreshPlanRuns.Add(1) + } + db.vlogGenerationRewriteDeadlineLastNS.Store(time.Now().UnixNano()) +} + func (db *DB) observeVlogGenerationRewriteQueuePrune(dropped int) { if db == nil || dropped <= 0 { return @@ -12886,7 +13925,7 @@ func (db *DB) scheduleDueVlogGenerationDeferredMaintenance() { } func (db *DB) runVlogGenerationCheckpointKickRetries(opts vlogGenerationMaintenanceOptions) { - db.runVlogGenerationMaintenanceRetries(opts, vlogGenerationCheckpointKickRetryWindow, false) + db.runVlogGenerationMaintenanceRetries(opts, vlogGenerationCheckpointKickRetryWindow, true) } func (db *DB) runVlogGenerationMaintenanceRetries(opts vlogGenerationMaintenanceOptions, retryWindow time.Duration, stopWhenAcquired bool) { @@ -12910,6 +13949,22 @@ func (db *DB) runVlogGenerationMaintenanceRetries(opts 
vlogGenerationMaintenance deadline := time.Now().Add(retryWindow) sleepDelay := 10 * time.Millisecond for !db.closing.Load() { + // Retry loops should never hammer an already-active maintenance pass. + // Wait for release/deadline instead of repeatedly colliding and inflating + // maintenance.attempts/collisions under hot checkpoint-kick activity. + if db.vlogGenerationMaintenanceActive.Load() { + if time.Now().After(deadline) { + return + } + time.Sleep(sleepDelay) + if sleepDelay < 100*time.Millisecond { + sleepDelay *= 2 + if sleepDelay > 100*time.Millisecond { + sleepDelay = 100 * time.Millisecond + } + } + continue + } attempt++ if opts.debugSource != "" { db.debugVlogMaintf( @@ -12921,7 +13976,11 @@ func (db *DB) runVlogGenerationMaintenanceRetries(opts vlogGenerationMaintenance db.vlogGenerationMaintenanceActive.Load(), ) } - ran := db.maybeRunVlogGenerationMaintenanceWithOptions(true, opts) + // Retry-driven maintenance (checkpoint kick / deferred stage confirmation) + // prioritizes rewrite debt progress. Keep periodic/full-scan GC on the + // normal scheduler path to avoid introducing long full-scan stalls on hot + // checkpoint-triggered retries. + ran := db.maybeRunVlogGenerationMaintenanceWithOptions(false, opts) if stopWhenAcquired && ran { if opts.debugSource != "" { db.debugVlogMaintf( @@ -12975,14 +14034,17 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog if db == nil || db.closing.Load() || db.valueLogGenerationPolicy != uint8(backenddb.ValueLogGenerationHotWarmCold) { return } + db.vlogGenerationMaintenanceAttempts.Add(1) // In WAL-on mode, the periodic "runGC" tick must not enter the maintenance // engine at all. Checkpoint-coupled work belongs to the explicit // checkpoint-kick/deferred paths; letting the periodic GC tick even acquire // maintenanceActive can strand that slot behind hot restore-time locks. 
if runGC && !db.disableJournal && !opts.bypassQuiet { + db.vlogGenerationMaintenanceSkipWALOnPeriodic.Add(1) return } if db.suppressBackgroundVlogGenerationForMaintenancePhase() { + db.vlogGenerationMaintenanceSkipPhase.Add(1) if opts.debugSource != "" { db.debugVlogMaintf( "maintenance_skip reason=maintenance_phase source=%s phase=%s checkpoint_pending=%t deferred_pending=%t", @@ -12998,6 +14060,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // checkpoint-kick path can race otherwise, which causes overlapping rewrite // runs to compete on the same resume queue. if !db.vlogGenerationMaintenanceActive.CompareAndSwap(false, true) { + db.vlogGenerationMaintenanceCollisions.Add(1) // Checkpoint-kick retries are high-priority and quiet-window-bypassed by // design. If they collide with an active pass, queue exactly one retry to // run right after the active pass exits. @@ -13016,6 +14079,9 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog return } acquired = true + db.vlogGenerationMaintenanceAcquired.Add(1) + rewriteRunsBefore := db.vlogGenerationRewriteRuns.Load() + gcRunsBefore := db.vlogGenerationGCRuns.Load() activeSource := vlogGenerationMaintenanceDebugSource(opts) activeStart := time.Now() db.debugVlogMaintf( @@ -13028,19 +14094,32 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog db.vlogGenerationDeferredMaintenancePending.Load(), ) defer func() { + passDur := time.Since(activeStart) db.debugVlogMaintf( "maintenance_active_release source=%s dur_ms=%d checkpoint_pending=%t deferred_pending=%t", activeSource, - time.Since(activeStart).Milliseconds(), + passDur.Milliseconds(), db.vlogGenerationCheckpointKickPending.Load(), db.vlogGenerationDeferredMaintenancePending.Load(), ) + db.observeVlogGenerationMaintenancePassDuration(passDur) db.vlogGenerationMaintenanceActive.Store(false) // If a deferred confirmation/age wake became due while this pass held the // 
scheduler active, requeue it immediately on exit instead of relying on // the original retry goroutine to still be alive. db.scheduleDueVlogGenerationDeferredMaintenance() db.schedulePendingVlogGenerationCheckpointKick() + rewriteRan := db.vlogGenerationRewriteRuns.Load() > rewriteRunsBefore + gcRan := db.vlogGenerationGCRuns.Load() > gcRunsBefore + if rewriteRan { + db.vlogGenerationMaintenancePassWithRewrite.Add(1) + } + if gcRan { + db.vlogGenerationMaintenancePassWithGC.Add(1) + } + if !rewriteRan && !gcRan { + db.vlogGenerationMaintenancePassNoop.Add(1) + } }() now := time.Now() quiet := db.foregroundActivityQuietFor(now, vlogGenerationMaintenanceQuietWindow, vlogForegroundReadQuietWindow) @@ -13083,16 +14162,21 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // passes spend long maintenance windows before the confirmation delay // has elapsed. The only valid next step is to wait for confirmation. if !vlogGenerationIsStageConfirmSource(opts) { + db.vlogGenerationMaintenanceSkipStageGate.Add(1) + db.vlogGenerationMaintenanceSkipStageNotDue.Add(1) return } } else if !vlogGenerationIsStageConfirmSource(opts) { // When confirmation becomes due, reserve the maintenance slot for the // explicit stage-confirm wake instead of letting generic retries or // periodic passes reacquire it first. + db.vlogGenerationMaintenanceSkipStageGate.Add(1) + db.vlogGenerationMaintenanceSkipStageDue.Add(1) return } } if !stagePending && ageBlockedDue && !vlogGenerationIsAgeBlockedSource(opts) { + db.vlogGenerationMaintenanceSkipAgeBlocked.Add(1) return } // Checkpoint-collision retries and timer-driven confirmation wakes should run @@ -13100,11 +14184,13 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // collisions where periodic maintenance keeps reacquiring the scheduler while // the higher-priority retry is still trying to run. 
if !opts.bypassQuiet && (db.vlogGenerationCheckpointKickPending.Load() || db.vlogGenerationDeferredMaintenancePending.Load()) { + db.vlogGenerationMaintenanceSkipPriority.Add(1) return } // Explicit GC runs bypass the foreground quiet-window gate so callers can // force a safety/cleanup pass even while foreground activity is ongoing. if !runGC && !opts.bypassQuiet && !quiet { + db.vlogGenerationMaintenanceSkipQuiet.Add(1) return } // In WAL-off mode, do not start rewrite/GC planning before the first @@ -13114,7 +14200,9 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // caused real restore stalls. Keep WAL-on profiles eligible for maintenance // before the first checkpoint; starving that path causes the main value-log // lane to grow unchecked during restore. - if db.disableJournal && db.checkpointRuns.Load() == 0 && !runGC && len(rewriteQueue) == 0 && !opts.skipCheckpoint { + allowPreCheckpointRewrite := envBool(envEnableVlogGenerationPreCheckpointRewrite) + if db.disableJournal && db.checkpointRuns.Load() == 0 && !runGC && len(rewriteQueue) == 0 && !opts.skipCheckpoint && !allowPreCheckpointRewrite { + db.vlogGenerationMaintenanceSkipPreCheckpoint.Add(1) return } // Retained-prune and generation maintenance use the same foreground quiet-window gate. 
@@ -13135,10 +14223,12 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog return } } else { + db.vlogGenerationMaintenanceSkipCheckpointing.Add(1) return } } if db.checkpointing.Load() { + db.vlogGenerationMaintenanceSkipCheckpointing.Add(1) return } now = time.Now() @@ -13250,11 +14340,12 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog MaxSourceBytes: maxSourceBytes, MinSegmentStaleRatio: minStaleRatio, MinSegmentStaleBytes: 1, - MinSegmentAge: vlogGenerationRewriteMinSegmentAge, + MinSegmentAge: db.valueLogRewriteMinSegmentAge, } planStart := time.Now() plan, err := planner.ValueLogRewritePlan(ctx, planOpts) cancel() + planDur := time.Since(planStart) db.debugVlogMaintf( "rewrite_plan stale_ratio_trigger min_ratio=%.6f max_source_bytes=%d selected=%d/%d selected_bytes_total=%d selected_bytes_live=%d selected_bytes_stale=%d total_bytes=%d live_bytes=%d stale_bytes=%d dur_ms=%.3f err=%v", minStaleRatio, @@ -13267,10 +14358,10 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog plan.BytesTotal, plan.BytesLive, plan.BytesStale, - float64(time.Since(planStart).Microseconds())/1000, + float64(planDur.Microseconds())/1000, err, ) - db.observeVlogGenerationRewritePlanOutcome(plan, err) + db.observeVlogGenerationRewritePlanOutcomeWithDuration(plan, err, planDur) updatePlanTimestamp := false if err != nil { db.clearVlogGenerationRewriteAgeBlockedUntil() @@ -13292,6 +14383,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog } } else if len(plan.SourceFileIDs) > 0 { db.clearVlogGenerationRewriteAgeBlockedUntil() + beforePenaltyFilter := len(plan.SourceFileIDs) plan, err = db.filterVlogGenerationRewritePlanPenalties(plan, now) if err != nil { db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerError) @@ -13300,6 +14392,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog } return } + 
db.observeVlogGenerationRewritePlanPenaltyFilter(beforePenaltyFilter, len(plan.SourceFileIDs)) updatePlanTimestamp = true if len(plan.SourceFileIDs) > 0 { if stagePending { @@ -13313,7 +14406,9 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog } confirmed := stableVlogGenerationRewriteLedgerSegments(stagedLedger, plan.SelectedSegments) if len(confirmed) > 0 { - plan = filterVlogGenerationRewritePlanToSegments(plan, confirmed) + // Treat confirmation overlap as a stability signal, then run + // the current sparse plan (not just the overlap subset) so live + // maintenance can make forward progress within short sync windows. shouldRewrite = true reason = vlogGenerationReasonRewriteResume } else { @@ -13404,9 +14499,10 @@ planned: MaxSourceBytes: maxSourceBytes, MinSegmentStaleRatio: minStaleRatio, MinSegmentStaleBytes: vlogGenerationRewriteMinSegmentStaleBytes, - MinSegmentAge: vlogGenerationRewriteMinSegmentAge, + MinSegmentAge: db.valueLogRewriteMinSegmentAge, }) cancel() + planDur := time.Since(planStart) db.debugVlogMaintf( "rewrite_plan pre_rewrite max_source_bytes=%d min_ratio=%.6f min_stale_bytes=%d selected=%d/%d selected_bytes_total=%d selected_bytes_live=%d selected_bytes_stale=%d total_bytes=%d live_bytes=%d stale_bytes=%d dur_ms=%.3f err=%v", maxSourceBytes, @@ -13420,10 +14516,10 @@ planned: plan.BytesTotal, plan.BytesLive, plan.BytesStale, - float64(time.Since(planStart).Microseconds())/1000, + float64(planDur.Microseconds())/1000, err, ) - db.observeVlogGenerationRewritePlanOutcome(plan, err) + db.observeVlogGenerationRewritePlanOutcomeWithDuration(plan, err, planDur) if err != nil { db.clearVlogGenerationRewriteAgeBlockedUntil() if isVlogGenerationPlannerCanceled(err) { @@ -13444,6 +14540,7 @@ planned: } if len(plan.SourceFileIDs) > 0 { db.clearVlogGenerationRewriteAgeBlockedUntil() + beforePenaltyFilter := len(plan.SourceFileIDs) plan, err = db.filterVlogGenerationRewritePlanPenalties(plan, now) if err != nil { 
db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerError) @@ -13453,16 +14550,17 @@ planned: } return } + db.observeVlogGenerationRewritePlanPenaltyFilter(beforePenaltyFilter, len(plan.SourceFileIDs)) } if len(plan.SourceFileIDs) == 0 { - if shouldDeferVlogGenerationRewritePlanForAge(plan, vlogGenerationRewriteMinSegmentAge) { + if shouldDeferVlogGenerationRewritePlanForAge(plan, db.valueLogRewriteMinSegmentAge) { db.setVlogGenerationRewriteAgeBlockedUntil(now.Add(plan.AgeBlockedMinRemainingAge)) db.debugVlogMaintf( "rewrite_plan pre_rewrite age_blocked segments=%d stale_bytes=%d retry_after_ms=%d min_age_ms=%d", plan.AgeBlockedSegments, plan.AgeBlockedBytesStale, plan.AgeBlockedMinRemainingAge.Milliseconds(), - vlogGenerationRewriteMinSegmentAge.Milliseconds(), + db.valueLogRewriteMinSegmentAge.Milliseconds(), ) } else { db.clearVlogGenerationRewriteAgeBlockedUntil() @@ -13589,9 +14687,15 @@ planned: } } rewriteQueue = append([]uint32(nil), rewritePlan.SourceFileIDs...) - // Do not debt-drain freshly planned work in the same pass; only apply - // multi-segment debt-drain to explicit resume queues. - rewriteMaxSegments = vlogGenerationRewriteResumeMaxSegments + // Do not debt-drain freshly planned work in the same pass. The only + // exception is a confirmed staged rewrite-resume pass, which should + // be allowed to consume debt in bounded multi-segment chunks. + allowPlanDebtDrain := reason == vlogGenerationReasonRewriteResume && opts.rewriteDebtDrain + if allowPlanDebtDrain { + rewriteMaxSegments = db.vlogGenerationRewriteMaxSegmentsForRun(len(rewriteQueue), budgetTokens, opts) + } else { + rewriteMaxSegments = db.vlogGenerationRewriteMaxSegmentsForFreshPlan(len(rewriteQueue), budgetTokens, opts) + } // If the token bucket is enabled and empty, persist the plan/ledger but // skip running the rewrite until we have budget to spend. 
if db.vlogGenerationRewriteBudgetEnabled() && budgetTokens <= 0 { @@ -13673,14 +14777,14 @@ planned: rewriteOpts.MaxSourceBytes = maxSourceBytes rewriteOpts.MinSegmentStaleRatio = db.vlogGenerationRewriteMinStaleRatioForGenericPass(totalBytes) rewriteOpts.MinSegmentStaleBytes = vlogGenerationRewriteMinSegmentStaleBytes - rewriteOpts.MinSegmentAge = vlogGenerationRewriteMinSegmentAge + rewriteOpts.MinSegmentAge = db.valueLogRewriteMinSegmentAge } var ctx context.Context var cancel context.CancelFunc - if hadRewriteQueue && len(processedRewriteIDs) > 0 { + if len(processedRewriteIDs) > 0 { ctx, cancel = context.WithTimeout(context.Background(), vlogGenerationRewriteBoundedExecTimeout) } else { - ctx, cancel = db.foregroundMaintenanceContext(2 * time.Minute) + ctx, cancel = db.vlogGenerationMaintenanceContext(2*time.Minute, opts) } db.debugVlogMaintf( "rewrite_exec reason=%s source_ids=%d max_segments=%d budget_tokens=%d max_source_bytes=%d min_stale_ratio=%.6f queue_len=%d ledger_live_bytes=%d", @@ -13696,16 +14800,21 @@ planned: rewriteStart := time.Now() stats, err := rewriter.ValueLogRewriteOnline(ctx, rewriteOpts) cancel() + rewriteDur := time.Since(rewriteStart) + db.observeVlogGenerationRewriteExecDuration(rewriteDur) if err != nil { - db.debugVlogMaintf("rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), err, float64(time.Since(rewriteStart).Microseconds())/1000) + db.debugVlogMaintf("rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), err, float64(rewriteDur.Microseconds())/1000) + queuedDebt := hadRewriteQueue && len(processedRewriteIDs) > 0 if errors.Is(err, context.Canceled) { - db.observeVlogGenerationRewriteCanceled() + db.observeVlogGenerationRewriteCanceled(queuedDebt) if len(processedRewriteIDs) > 0 { // A canceled rewrite that already selected a queued chunk should // immediately queue a checkpoint-kick retry. The retry executes // as resumable debt with bounded non-cancel semantics. 
db.vlogGenerationCheckpointKickPending.Store(true) } + } else if errors.Is(err, context.DeadlineExceeded) { + db.observeVlogGenerationRewriteDeadline(queuedDebt) } return fmt.Errorf("generational rewrite: %w", err) } @@ -13717,7 +14826,7 @@ planned: stats.BytesBefore, stats.BytesAfter, stats.RecordsCopied, - float64(time.Since(rewriteStart).Microseconds())/1000, + float64(rewriteDur.Microseconds())/1000, ) effectiveBytesBefore := int64(stats.BytesBefore) effectiveBytesAfter := int64(stats.BytesAfter) @@ -13728,26 +14837,154 @@ planned: } } if gcer, ok := db.backend.(backendValueLogGCer); ok { - gcCtx, gcCancel := context.WithTimeout(context.Background(), 30*time.Second) - gcStart := time.Now() - gcStats, gcErr := gcer.ValueLogGC(gcCtx, backenddb.ValueLogGCOptions{ - ProtectedPaths: db.valueLogProtectedPaths(), - }) - gcCancel() + gcOpts := db.valueLogGCOptions(false) + if len(processedRewriteIDs) > 0 { + gcOpts.ObservedSourceFileIDs = append([]uint32(nil), processedRewriteIDs...) + } + runGC := func(phase string) (backenddb.ValueLogGCStats, error) { + gcCtx, gcCancel := context.WithTimeout(context.Background(), 30*time.Second) + gcStart := time.Now() + gcStats, gcErr := gcer.ValueLogGC(gcCtx, gcOpts) + gcCancel() + gcDur := time.Since(gcStart) + db.observeVlogGenerationGCExecDuration(gcDur) + if gcErr != nil { + db.debugVlogMaintf( + "gc_after_rewrite_err reason=%s phase=%s err=%v dur_ms=%.3f", + vlogGenerationReasonString(reason), + phase, + gcErr, + float64(gcDur.Microseconds())/1000, + ) + return backenddb.ValueLogGCStats{}, gcErr + } + db.observeVlogGenerationGCStats(gcStats) + db.vlogGenerationGCRuns.Add(1) + if gcStats.SegmentsDeleted > 0 { + db.vlogGenerationGCSegmentsDeleted.Add(uint64(gcStats.SegmentsDeleted)) + } + if gcStats.BytesDeleted > 0 { + db.vlogGenerationGCBytesDeleted.Add(uint64(gcStats.BytesDeleted)) + gcBytesDeleted += int64(gcStats.BytesDeleted) + effectiveBytesAfter -= int64(gcStats.BytesDeleted) + if effectiveBytesAfter < 0 { + 
effectiveBytesAfter = 0 + } + } + db.debugVlogMaintf( + "gc_after_rewrite_done reason=%s phase=%s dur_ms=%.3f", + vlogGenerationReasonString(reason), + phase, + float64(gcDur.Microseconds())/1000, + ) + return gcStats, nil + } + + gcStats, gcErr := runGC("initial") if gcErr != nil { - db.debugVlogMaintf("gc_after_rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), gcErr, float64(time.Since(gcStart).Microseconds())/1000) return fmt.Errorf("generational gc after rewrite: %w", gcErr) } - if gcStats.BytesDeleted > 0 { - gcBytesDeleted = int64(gcStats.BytesDeleted) - effectiveBytesAfter -= gcBytesDeleted - if effectiveBytesAfter < 0 { - effectiveBytesAfter = 0 + + rewriteBlockedByRetained := len(processedRewriteIDs) > 0 && + gcStats.ObservedSourceSegments > 0 && + gcStats.ObservedSourceSegmentsReferenced == 0 && + gcStats.ObservedSourceSegmentsEligible == 0 && + gcStats.ObservedSourceSegmentsProtectedRetained > 0 + if rewriteBlockedByRetained { + if db.retainedPruneActive() { + db.queueRetainedPruneObservedSourceIDs(processedRewriteIDs) + // A prune is already in flight. Ensure a follow-up attempt stays queued. + db.scheduleRetainedValueLogPruneForce() + // Request a follow-up maintenance pass so GC can re-evaluate + // rewrite-observed source segments after the in-flight prune completes. 
+ db.vlogGenerationCheckpointKickPending.Store(true) + } else { + observedSourceIDSet := make(map[uint32]struct{}, len(processedRewriteIDs)) + for _, id := range processedRewriteIDs { + if id == 0 { + continue + } + observedSourceIDSet[id] = struct{}{} + } + nowPrune := time.Now() + db.retainedPruneLastStartUnixNano.Store(nowPrune.UnixNano()) + db.retainedValueLogPruneRuns.Add(1) + db.retainedValueLogPruneForcedRuns.Add(1) + db.retainedValueLogPruneLastUnixNano.Store(nowPrune.UnixNano()) + pruneStats := db.pruneRetainedValueLogsWithObserved(true, observedSourceIDSet) + db.observeRetainedValueLogPruneStats(pruneStats) + db.debugVlogMaintf( + "rewrite_retained_prune reason=%s observed_source_retained_segments=%d observed_source_retained_bytes=%d observed_source_seen_segments=%d observed_source_seen_bytes=%d observed_source_candidate_segments=%d observed_source_candidate_bytes=%d observed_source_removed_segments=%d observed_source_removed_bytes=%d observed_source_zombie_marked_segments=%d observed_source_zombie_marked_bytes=%d observed_source_live_skipped_segments=%d observed_source_live_skipped_bytes=%d observed_source_in_use_skipped_segments=%d observed_source_in_use_skipped_bytes=%d observed_source_parse_skipped_segments=%d observed_source_parse_skipped_bytes=%d removed_segments=%d removed_bytes=%d zombie_marked_segments=%d zombie_marked_bytes=%d live_skipped_segments=%d live_skipped_bytes=%d aborted=%t", + vlogGenerationReasonString(reason), + gcStats.ObservedSourceSegmentsProtectedRetained, + gcStats.ObservedSourceBytesProtectedRetained, + pruneStats.ObservedSourceSegments, + pruneStats.ObservedSourceBytes, + pruneStats.ObservedSourceCandidateSegments, + pruneStats.ObservedSourceCandidateBytes, + pruneStats.ObservedSourceRemovedSegments, + pruneStats.ObservedSourceRemovedBytes, + pruneStats.ObservedSourceZombieMarkedSegments, + pruneStats.ObservedSourceZombieMarkedBytes, + pruneStats.ObservedSourceLiveSkippedSegments, + pruneStats.ObservedSourceLiveSkippedBytes, 
+ pruneStats.ObservedSourceInUseSkippedSegments, + pruneStats.ObservedSourceInUseSkippedBytes, + pruneStats.ObservedSourceParseSkippedSegments, + pruneStats.ObservedSourceParseSkippedBytes, + pruneStats.RemovedSegments, + pruneStats.RemovedBytes, + pruneStats.ZombieMarkedSegments, + pruneStats.ZombieMarkedBytes, + pruneStats.LiveSkippedSegments, + pruneStats.LiveSkippedBytes, + pruneStats.AbortedForegroundWrites, + ) + // Refresh protected path sets after inline retained prune so + // the follow-up GC pass evaluates updated retention state. + gcOpts = db.valueLogGCOptions(false) + if len(processedRewriteIDs) > 0 { + gcOpts.ObservedSourceFileIDs = append([]uint32(nil), processedRewriteIDs...) + } + gcStatsAfterPrune, gcErr := runGC("post_retained_prune") + if gcErr != nil { + return fmt.Errorf("generational gc after retained prune: %w", gcErr) + } + gcStats = gcStatsAfterPrune + } + } + if len(processedRewriteIDs) > 0 && + gcStats.ObservedSourceSegments > 0 && + gcStats.ObservedSourceSegmentsProtectedRetained > 0 && + gcStats.ObservedSourceSegmentsEligible == 0 { + // Rewrite-selected source segments remained retained-protected + // after in-pass prune/GC. Queue an observed-source replay GC for + // the next maintenance pass. + db.queueVlogGenerationObservedSourceGCList(processedRewriteIDs) + db.vlogGenerationCheckpointKickPending.Store(true) + } + if gcStats.BytesProtectedRetained > 0 && gcStats.BytesEligible == 0 && db.valueLogRetainedClosedBytes.Load() > 0 { + // Retained-path protection can starve live reclaim even when rewrite + // processed stale payload in-pass. Kick an eager retained prune so + // lifecycle pins can drain without waiting for byte-pressure gates. 
+ if len(processedRewriteIDs) > 0 { + db.queueRetainedPruneObservedSourceIDs(processedRewriteIDs) } + db.scheduleRetainedValueLogPruneForce() } - db.debugVlogMaintf("gc_after_rewrite_done reason=%s dur_ms=%.3f", vlogGenerationReasonString(reason), float64(time.Since(gcStart).Microseconds())/1000) + } + if effectiveBytesBefore > effectiveBytesAfter { + db.vlogGenerationRewriteReclaimedBytes.Add(uint64(effectiveBytesBefore - effectiveBytesAfter)) } locallyEffectiveProcessedDebt := len(processedRewriteIDs) > 0 && processedLedgerOK && processedLedgerStaleBytes > 0 && stats.RecordsCopied > 0 + if processedLedgerOK { + if processedLedgerLiveBytes > 0 { + db.vlogGenerationRewriteProcessedLiveBytes.Add(uint64(processedLedgerLiveBytes)) + } + if processedLedgerStaleBytes > 0 { + db.vlogGenerationRewriteProcessedStaleBytes.Add(uint64(processedLedgerStaleBytes)) + } + } if effectiveBytesBefore > 0 && effectiveBytesAfter >= effectiveBytesBefore && !locallyEffectiveProcessedDebt { db.vlogGenerationRewriteIneffectiveRuns.Add(1) db.vlogGenerationRewriteIneffectiveBytesIn.Add(uint64(effectiveBytesBefore)) @@ -13778,6 +15015,12 @@ planned: } } if locallyEffectiveProcessedDebt { + if effectiveBytesAfter >= effectiveBytesBefore { + db.vlogGenerationRewriteNoReclaimRuns.Add(1) + if processedLedgerStaleBytes > 0 { + db.vlogGenerationRewriteNoReclaimStaleBytes.Add(uint64(processedLedgerStaleBytes)) + } + } db.debugVlogMaintf( "rewrite_effective_local reason=%s processed_ids=%d planned_total=%d planned_live=%d planned_stale=%d global_bytes_before=%d global_bytes_after=%d gc_bytes_deleted=%d records=%d", vlogGenerationReasonString(reason), @@ -13818,6 +15061,33 @@ planned: } db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) db.vlogGenerationRewriteRuns.Add(1) + if sourceSegments := len(rewriteOpts.SourceFileIDs); sourceSegments > 0 { + db.vlogGenerationRewriteExecSourceSegments.Add(uint64(sourceSegments)) + } + sourceSegmentsRequested := uint64(0) + if 
stats.SourceSegmentsRequested > 0 { + sourceSegmentsRequested = uint64(stats.SourceSegmentsRequested) + } + sourceSegmentsStillReferenced := uint64(0) + if stats.SourceSegmentsStillReferenced > 0 { + sourceSegmentsStillReferenced = uint64(stats.SourceSegmentsStillReferenced) + } + sourceSegmentsUnreferenced := uint64(0) + if stats.SourceSegmentsUnreferenced > 0 { + sourceSegmentsUnreferenced = uint64(stats.SourceSegmentsUnreferenced) + } + db.vlogGenerationRewriteSourceSegmentsRequestedLast.Store(sourceSegmentsRequested) + db.vlogGenerationRewriteSourceSegmentsStillReferencedLast.Store(sourceSegmentsStillReferenced) + db.vlogGenerationRewriteSourceSegmentsUnreferencedLast.Store(sourceSegmentsUnreferenced) + if sourceSegmentsRequested > 0 { + db.vlogGenerationRewriteSourceSegmentsRequestedTotal.Add(sourceSegmentsRequested) + } + if sourceSegmentsStillReferenced > 0 { + db.vlogGenerationRewriteSourceSegmentsStillReferencedTotal.Add(sourceSegmentsStillReferenced) + } + if sourceSegmentsUnreferenced > 0 { + db.vlogGenerationRewriteSourceSegmentsUnreferencedTotal.Add(sourceSegmentsUnreferenced) + } rewriteBytesIn := int64(0) if processedLedgerOK { rewriteBytesIn = processedLedgerLiveBytes @@ -13851,6 +15121,18 @@ planned: if stats.RecordsCopied > 0 { db.vlogGenerationRemapSuccesses.Add(uint64(stats.RecordsCopied)) } + if stats.ValueRecordsCopied > 0 { + db.vlogGenerationRewriteValueRecordsCopied.Add(uint64(stats.ValueRecordsCopied)) + } + if stats.ValueBytesCopied > 0 { + db.vlogGenerationRewriteValueBytesCopied.Add(uint64(stats.ValueBytesCopied)) + } + if stats.LeafRefRecordsCopied > 0 { + db.vlogGenerationRewriteLeafRefRecordsCopied.Add(uint64(stats.LeafRefRecordsCopied)) + } + if stats.LeafRefBytesCopied > 0 { + db.vlogGenerationRewriteLeafRefBytesCopied.Add(uint64(stats.LeafRefBytesCopied)) + } if consumed > 0 { db.vlogGenerationConsumeRewriteBudgetBytes(consumed) } @@ -13877,29 +15159,98 @@ planned: return } + observedSourceGCIDs := 
db.takeVlogGenerationObservedSourceGCList() + forceObservedSourceGC := len(observedSourceGCIDs) > 0 + if !runGC && opts.bypassQuiet && !forceObservedSourceGC { + // Checkpoint-kick/deferred retry passes are rewrite-priority. Do not run + // opportunistic GC here unless we are replaying observed-source IDs from + // a prior rewrite/GC cycle. + return + } if envBool(envDisableVlogGenerationGC) { + db.debugVlogMaintf( + "gc_skip reason=disabled_env run_gc=%t force_observed=%t observed_ids=%d", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + ) + if forceObservedSourceGC { + queuedIDs, droppedIDs := db.retryVlogGenerationObservedSourceGCList(observedSourceGCIDs) + db.debugVlogMaintf( + "gc_observed_retry reason=disabled_env observed_ids=%d queued_ids=%d dropped_ids=%d", + len(observedSourceGCIDs), + queuedIDs, + droppedIDs, + ) + } return } // GC is a best-effort background maintenance task. It requires a checkpoint // barrier to be safe, and that barrier can be very expensive during sustained // ingest/restore when the flush queue is non-empty. Avoid introducing long // stalls by only running the GC path when the cached write queue is drained. 
- if queueLen != 0 { + if queueLen != 0 && !forceObservedSourceGC { + db.debugVlogMaintf( + "gc_skip reason=queue_not_drained run_gc=%t queue_len=%d force_observed=%t", + runGC, + queueLen, + forceObservedSourceGC, + ) return } gcer, ok := db.backend.(backendValueLogGCer) if !ok { + db.debugVlogMaintf( + "gc_skip reason=backend_no_gcer run_gc=%t force_observed=%t observed_ids=%d", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + ) + if forceObservedSourceGC { + queuedIDs, droppedIDs := db.retryVlogGenerationObservedSourceGCList(observedSourceGCIDs) + db.debugVlogMaintf( + "gc_observed_retry reason=backend_no_gcer observed_ids=%d queued_ids=%d dropped_ids=%d", + len(observedSourceGCIDs), + queuedIDs, + droppedIDs, + ) + } return } - needEligibilityEstimate := !runGC && !db.shouldRunVlogGenerationGC(retained, reclaimable, churnBps) + // Retry-driven checkpoint/deferred passes are rewrite-priority paths. Avoid + // issuing GC dry-run scans there; let periodic/manual GC decide eligibility. 
+ needEligibilityEstimate := !runGC && !opts.bypassQuiet && !forceObservedSourceGC && !db.shouldRunVlogGenerationGC(retained, reclaimable, churnBps) now = time.Now() lastGC := db.vlogGenerationLastGCUnixNano.Load() if lastGC > 0 { lastAt := time.Unix(0, lastGC) - if now.Sub(lastAt) < vlogGenerationGCMinInterval { + if !forceObservedSourceGC && now.Sub(lastAt) < vlogGenerationGCMinInterval { + db.debugVlogMaintf( + "gc_skip reason=min_interval run_gc=%t force_observed=%t observed_ids=%d since_ms=%.3f min_ms=%.3f", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + float64(now.Sub(lastAt).Microseconds())/1000, + float64(vlogGenerationGCMinInterval.Microseconds())/1000, + ) return } } + if !forceObservedSourceGC { + lastNoop := db.vlogGenerationLastGCNoopUnixNano.Load() + if lastNoop > 0 { + lastNoopAt := time.Unix(0, lastNoop) + if now.Sub(lastNoopAt) < vlogGenerationGCNoopMinInterval { + db.debugVlogMaintf( + "gc_skip reason=noop_cooldown run_gc=%t since_ms=%.3f min_ms=%.3f", + runGC, + float64(now.Sub(lastNoopAt).Microseconds())/1000, + float64(vlogGenerationGCNoopMinInterval.Microseconds())/1000, + ) + return + } + } + } db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerRunning) db.vlogGenerationLastReason.Store(vlogGenerationReasonPeriodicGC) err = db.runWithBackendMaintenanceOptions(backendMaintenanceOptions{ @@ -13912,18 +15263,104 @@ planned: return fmt.Errorf("generational gc dry-run: %w", err) } if gcStats.BytesEligible < vlogGenerationGCMinBytes && gcStats.SegmentsEligible == 0 { + db.debugVlogMaintf( + "gc_skip reason=below_eligibility_floor run_gc=%t force_observed=%t eligible_bytes=%d eligible_segments=%d min_bytes=%d", + runGC, + forceObservedSourceGC, + gcStats.BytesEligible, + gcStats.SegmentsEligible, + vlogGenerationGCMinBytes, + ) return nil } } now := time.Now() db.vlogGenerationLastGCUnixNano.Store(now.UnixNano()) - ctx, cancel := db.foregroundMaintenanceContext(30 * time.Second) - gcOpts := 
backenddb.ValueLogGCOptions{ProtectedPaths: db.valueLogProtectedPaths()} + ctx, cancel := db.vlogGenerationMaintenanceContext(30*time.Second, opts) + gcOpts := db.valueLogGCOptions(false) + if forceObservedSourceGC { + gcOpts.ObservedSourceFileIDs = append([]uint32(nil), observedSourceGCIDs...) + db.vlogGenerationObservedGCRuns.Add(1) + } + db.debugVlogMaintf( + "gc_run start run_gc=%t force_observed=%t observed_ids=%d need_estimate=%t", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + needEligibilityEstimate, + ) + gcStart := time.Now() gcStats, err := gcer.ValueLogGC(ctx, gcOpts) cancel() + db.observeVlogGenerationGCExecDuration(time.Since(gcStart)) if err != nil { + db.debugVlogMaintf( + "gc_run err run_gc=%t force_observed=%t observed_ids=%d err=%v", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + err, + ) return fmt.Errorf("generational gc: %w", err) } + db.debugVlogMaintf( + "gc_run done run_gc=%t force_observed=%t observed_ids=%d deleted_segments=%d deleted_bytes=%d protected_retained_bytes=%d observed_segments=%d observed_eligible=%d observed_deleted=%d observed_protected_retained=%d", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + gcStats.SegmentsDeleted, + gcStats.BytesDeleted, + gcStats.BytesProtectedRetained, + gcStats.ObservedSourceSegments, + gcStats.ObservedSourceSegmentsEligible, + gcStats.ObservedSourceSegmentsDeleted, + gcStats.ObservedSourceSegmentsProtectedRetained, + ) + db.observeVlogGenerationGCStats(gcStats) + if !forceObservedSourceGC && + gcStats.BytesDeleted == 0 && + gcStats.SegmentsDeleted == 0 && + gcStats.BytesEligible == 0 && + gcStats.SegmentsEligible == 0 { + db.vlogGenerationLastGCNoopUnixNano.Store(now.UnixNano()) + } else { + db.vlogGenerationLastGCNoopUnixNano.Store(0) + } + if gcStats.BytesProtectedRetained > 0 && gcStats.BytesEligible == 0 && db.valueLogRetainedClosedBytes.Load() > 0 { + // When GC classifies all reclaim blockers as retained-path protection, + // trigger an 
eager retained prune pass to release stale lifecycle pins. + if forceObservedSourceGC { + db.queueRetainedPruneObservedSourceIDs(observedSourceGCIDs) + } + db.scheduleRetainedValueLogPruneForce() + } + if forceObservedSourceGC && + gcStats.ObservedSourceSegments > 0 && + gcStats.ObservedSourceSegmentsProtectedRetained > 0 && + gcStats.ObservedSourceSegmentsEligible == 0 { + db.debugVlogMaintf( + "gc_observed_retry reason=retained_protected observed_ids=%d observed_segments=%d observed_protected_retained=%d observed_eligible=%d", + len(observedSourceGCIDs), + gcStats.ObservedSourceSegments, + gcStats.ObservedSourceSegmentsProtectedRetained, + gcStats.ObservedSourceSegmentsEligible, + ) + db.queueRetainedPruneObservedSourceIDs(observedSourceGCIDs) + db.scheduleRetainedValueLogPruneForce() + queuedIDs, droppedIDs := db.retryVlogGenerationObservedSourceGCList(observedSourceGCIDs) + if queuedIDs > 0 { + db.vlogGenerationCheckpointKickPending.Store(true) + } + db.debugVlogMaintf( + "gc_observed_retry_result reason=retained_protected observed_ids=%d queued_ids=%d dropped_ids=%d max_attempts=%d", + len(observedSourceGCIDs), + queuedIDs, + droppedIDs, + vlogGenerationObservedGCRetryMaxAttempts, + ) + } else if forceObservedSourceGC { + db.finalizeVlogGenerationObservedSourceGCIDs(observedSourceGCIDs, false) + } db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) db.vlogGenerationGCRuns.Add(1) if gcStats.SegmentsDeleted > 0 { @@ -13935,6 +15372,22 @@ planned: return nil }) if err != nil { + db.debugVlogMaintf( + "gc_maintenance_err run_gc=%t force_observed=%t observed_ids=%d err=%v", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + err, + ) + if forceObservedSourceGC { + queuedIDs, droppedIDs := db.retryVlogGenerationObservedSourceGCList(observedSourceGCIDs) + db.debugVlogMaintf( + "gc_observed_retry reason=gc_error observed_ids=%d queued_ids=%d dropped_ids=%d", + len(observedSourceGCIDs), + queuedIDs, + droppedIDs, + ) + } if errors.Is(err, 
context.Canceled) { db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) return @@ -14012,6 +15465,34 @@ func (db *DB) maybeKickVlogGenerationMaintenanceAfterCheckpoint() { return } now := time.Now() + rewriteDisabled := envBool(envDisableVlogGenerationRewrite) + rewriteQueueLen := 0 + if !rewriteDisabled { + rewriteQueue, qerr := db.currentVlogGenerationRewriteQueue() + if qerr != nil { + db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerError) + if db.notifyError != nil { + db.notifyError(fmt.Errorf("cachingdb: load generational rewrite queue for checkpoint kick: %w", qerr)) + } + return + } + rewriteQueueLen = len(rewriteQueue) + } + if envBool(envEnableVlogGenerationCheckpointKickHotDebtOnly) && !rewriteDisabled { + quiet := db.foregroundActivityQuietFor(now, vlogGenerationMaintenanceQuietWindow, vlogForegroundReadQuietWindow) + if !quiet && rewriteQueueLen == 0 && !db.vlogGenerationDeferredMaintenanceDue(now) { + db.vlogGenerationCheckpointKickSkippedHotNoDebt.Add(1) + db.debugVlogMaintf( + "checkpoint_kick_skip reason=foreground_hot_no_debt quiet=%t queue_len=%d checkpoint_pending=%t deferred_pending=%t deferred_due=%t", + quiet, + rewriteQueueLen, + db.vlogGenerationCheckpointKickPending.Load(), + db.vlogGenerationDeferredMaintenancePending.Load(), + db.vlogGenerationDeferredMaintenanceDue(now), + ) + return + } + } last := db.vlogGenerationLastCheckpointKickUnixNano.Load() if last > 0 && now.Sub(time.Unix(0, last)) < vlogGenerationCheckpointKickMinInterval { db.debugVlogMaintf( @@ -14023,16 +15504,8 @@ func (db *DB) maybeKickVlogGenerationMaintenanceAfterCheckpoint() { } // Avoid forcing extra checkpoint boundaries when rewrite is clearly ineligible. // Skip this fast-path when rewrite is disabled so GC-only kicks still run. 
- if !envBool(envDisableVlogGenerationRewrite) { - rewriteQueue, qerr := db.currentVlogGenerationRewriteQueue() - if qerr != nil { - db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerError) - if db.notifyError != nil { - db.notifyError(fmt.Errorf("cachingdb: load generational rewrite queue for checkpoint kick: %w", qerr)) - } - return - } - if len(rewriteQueue) == 0 { + if !rewriteDisabled { + if rewriteQueueLen == 0 { if trigger := db.valueLogRewriteTriggerBytes; trigger > 0 { retained, bytes := db.valueLogRetainedStats() if bytes < trigger && retained < 2 { @@ -14100,6 +15573,7 @@ func (db *DB) maybeRunVlogGenerationIndexVacuum(rewriteBytesIn int64) { return } if envBool(envDisableVlogGenerationVacuum) { + db.vlogGenerationVacuumSkippedDisabled.Add(1) return } vacuumer, ok := db.backend.(backendIndexVacuumer) @@ -14119,11 +15593,13 @@ func (db *DB) maybeRunVlogGenerationIndexVacuum(rewriteBytesIn int64) { return err } var err error + vacuumStart := time.Now() if db.maintenanceActive.Load() { err = runVacuum() } else { err = db.runWithBackendMaintenance(runVacuum) } + db.observeVlogGenerationVacuumExecDuration(time.Since(vacuumStart)) if err != nil { db.vlogGenerationVacuumFailures.Add(1) db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerError) @@ -14142,12 +15618,14 @@ func (db *DB) shouldRunVlogGenerationIndexVacuum(rewriteBytesIn int64, now time. 
return false } if rewriteBytesIn < vlogGenerationVacuumTriggerRewriteBytes { + db.vlogGenerationVacuumSkippedRewriteBytes.Add(1) return false } last := db.vlogGenerationLastVacuumUnixNano.Load() if last > 0 { lastAt := time.Unix(0, last) if now.Sub(lastAt) < vlogGenerationVacuumMinInterval { + db.vlogGenerationVacuumSkippedCooldown.Add(1) return false } } @@ -14176,10 +15654,7 @@ func (db *DB) estimateVlogGenerationGCEligible(gcer backendValueLogGCer) (backen } ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - stats, err := gcer.ValueLogGC(ctx, backenddb.ValueLogGCOptions{ - DryRun: true, - ProtectedPaths: db.valueLogProtectedPaths(), - }) + stats, err := gcer.ValueLogGC(ctx, db.valueLogGCOptions(true)) if err == nil { db.vlogGenerationLastGCDryRunUnixNano.Store(time.Now().UnixNano()) db.vlogGenerationLastGCDryRunBytesEligible.Store(stats.BytesEligible) @@ -14234,11 +15709,10 @@ func (db *DB) vlogGenerationRewriteMinStaleRatioForGenericPass(totalBytes int64) if totalBytes < vlogGenerationRewriteEfficacyMinTotalBytes { return 0 } - ratio := vlogGenerationRewriteGenericMinSegmentStaleRatio - if configured := db.vlogGenerationRewriteMinStaleRatioForStaleRatioTrigger(totalBytes); configured > ratio { - ratio = configured + if configured := db.vlogGenerationRewriteMinStaleRatioForStaleRatioTrigger(totalBytes); configured > 0 { + return configured } - return ratio + return vlogGenerationRewriteGenericMinSegmentStaleRatio } func (db *DB) vlogGenerationRewriteMinStaleRatioForQueuedDebt(totalBytes int64, reason uint32) float64 { @@ -19432,10 +20906,176 @@ func (db *DB) Stats() map[string]string { db.vlogGenerationRewriteQueueMu.Lock() rewriteQueueLen := len(db.vlogGenerationRewriteQueue) rewriteQueueLoaded := db.vlogGenerationRewriteQueueLoaded + rewriteLedgerSegments := len(db.vlogGenerationRewriteLedger) + rewritePenaltiesActive := len(db.vlogGenerationRewritePenalties) + rewriteStagePending := db.vlogGenerationRewriteStagePending + 
rewriteStageObservedNS := db.vlogGenerationRewriteStageObservedUnixNano + rewriteLedgerBytesTotal := int64(0) + rewriteLedgerBytesLive := int64(0) + rewriteLedgerBytesStale := int64(0) + for i := range db.vlogGenerationRewriteLedger { + seg := db.vlogGenerationRewriteLedger[i] + if seg.BytesTotal > 0 { + rewriteLedgerBytesTotal += seg.BytesTotal + } + if seg.BytesLive > 0 { + rewriteLedgerBytesLive += seg.BytesLive + } + if seg.BytesStale > 0 { + rewriteLedgerBytesStale += seg.BytesStale + } + } db.vlogGenerationRewriteQueueMu.Unlock() + db.vlogGenerationObservedGCMu.Lock() + observedGCPending := len(db.vlogGenerationObservedGCSourceIDs) + db.vlogGenerationObservedGCMu.Unlock() + observedGCLatencyCompleted := db.vlogGenerationObservedGCLatencyCompletedIDs.Load() + observedGCLatencyDropped := db.vlogGenerationObservedGCLatencyDroppedIDs.Load() + observedGCLatencyTotalMS := db.vlogGenerationObservedGCLatencyTotalMS.Load() + observedGCLatencyAvgMS := 0.0 + if totalObservedGCLatencyIDs := observedGCLatencyCompleted + observedGCLatencyDropped; totalObservedGCLatencyIDs > 0 { + observedGCLatencyAvgMS = float64(observedGCLatencyTotalMS) / float64(totalObservedGCLatencyIDs) + } + rewriteAgeBlockedUntilNS := db.vlogGenerationRewriteAgeBlockedUntilNS.Load() + rewriteAgeBlockedRemainingMS := int64(0) + if rewriteAgeBlockedUntilNS > 0 { + if d := time.Until(time.Unix(0, rewriteAgeBlockedUntilNS)); d > 0 { + rewriteAgeBlockedRemainingMS = d.Milliseconds() + } + } + rewriteBudgetTokens := db.vlogGenerationRewriteBudgetTokensBytes.Load() + if rewriteBudgetTokens < 0 { + rewriteBudgetTokens = 0 + } + rewriteBudgetCap := db.vlogGenerationRewriteBudgetCapBytes() + if rewriteBudgetCap < 0 { + rewriteBudgetCap = 0 + } + rewriteBudgetUtilPct := 0.0 + if rewriteBudgetCap > 0 { + rewriteBudgetUtilPct = (float64(rewriteBudgetTokens) / float64(rewriteBudgetCap)) * 100.0 + if rewriteBudgetUtilPct > 100.0 { + rewriteBudgetUtilPct = 100.0 + } + } + maintenancePassTotalNS := 
db.vlogGenerationMaintenancePassTotalNanos.Load() + maintenancePassMaxNS := db.vlogGenerationMaintenancePassMaxNanos.Load() + maintenancePasses := db.vlogGenerationMaintenanceAcquired.Load() + rewritePlanTotalNS := db.vlogGenerationRewritePlanTotalNanos.Load() + rewritePlanMaxNS := db.vlogGenerationRewritePlanMaxNanos.Load() + rewritePlanRuns := db.vlogGenerationRewritePlanRuns.Load() + rewriteExecTotalNS := db.vlogGenerationRewriteExecTotalNanos.Load() + rewriteExecMaxNS := db.vlogGenerationRewriteExecMaxNanos.Load() + rewriteRuns := db.vlogGenerationRewriteRuns.Load() + rewriteBytesInTotal := db.vlogGenerationRewriteBytesIn.Load() + rewriteBytesOutTotal := db.vlogGenerationRewriteBytesOut.Load() + rewriteReclaimedBytesTotal := db.vlogGenerationRewriteReclaimedBytes.Load() + rewriteValueRecordsCopiedTotal := db.vlogGenerationRewriteValueRecordsCopied.Load() + rewriteValueBytesCopiedTotal := db.vlogGenerationRewriteValueBytesCopied.Load() + rewriteLeafRefRecordsCopiedTotal := db.vlogGenerationRewriteLeafRefRecordsCopied.Load() + rewriteLeafRefBytesCopiedTotal := db.vlogGenerationRewriteLeafRefBytesCopied.Load() + rewriteProcessedLiveBytes := db.vlogGenerationRewriteProcessedLiveBytes.Load() + rewriteProcessedStaleBytes := db.vlogGenerationRewriteProcessedStaleBytes.Load() + rewriteProcessedTotal := rewriteProcessedLiveBytes + rewriteProcessedStaleBytes + rewriteBudgetConsumedTotal := db.vlogGenerationRewriteBudgetConsumed.Load() + rewriteChurnBps := db.vlogGenerationLastChurnBps.Load() + rewriteExecSeconds := 0.0 + if rewriteExecTotalNS > 0 { + rewriteExecSeconds = float64(rewriteExecTotalNS) / float64(time.Second) + } + rewriteBytesInPerSec := 0.0 + rewriteBytesOutPerSec := 0.0 + rewriteReclaimedBytesPerSec := 0.0 + rewriteBudgetConsumedPerSec := 0.0 + if rewriteExecSeconds > 0 { + rewriteBytesInPerSec = float64(rewriteBytesInTotal) / rewriteExecSeconds + rewriteBytesOutPerSec = float64(rewriteBytesOutTotal) / rewriteExecSeconds + rewriteReclaimedBytesPerSec = 
float64(rewriteReclaimedBytesTotal) / rewriteExecSeconds + rewriteBudgetConsumedPerSec = float64(rewriteBudgetConsumedTotal) / rewriteExecSeconds + } + rewriteOutputRatio := 0.0 + rewriteReclaimRatio := 0.0 + if rewriteBytesInTotal > 0 { + rewriteOutputRatio = float64(rewriteBytesOutTotal) / float64(rewriteBytesInTotal) + rewriteReclaimRatio = float64(rewriteReclaimedBytesTotal) / float64(rewriteBytesInTotal) + } + rewriteProcessedStaleRatio := 0.0 + if rewriteProcessedTotal > 0 { + rewriteProcessedStaleRatio = float64(rewriteProcessedStaleBytes) / float64(rewriteProcessedTotal) + } + rewriteBudgetConsumedSharePct := 0.0 + if db.valueLogRewriteBudgetBytes > 0 { + rewriteBudgetConsumedSharePct = (rewriteBudgetConsumedPerSec / float64(db.valueLogRewriteBudgetBytes)) * 100.0 + } + rewriteReclaimedVsChurnRatio := 0.0 + if rewriteChurnBps > 0 { + rewriteReclaimedVsChurnRatio = rewriteReclaimedBytesPerSec / float64(rewriteChurnBps) + } + gcExecTotalNS := db.vlogGenerationGCExecTotalNanos.Load() + gcExecMaxNS := db.vlogGenerationGCExecMaxNanos.Load() + gcRuns := db.vlogGenerationGCRuns.Load() + vacuumExecTotalNS := db.vlogGenerationVacuumExecTotalNanos.Load() + vacuumExecMaxNS := db.vlogGenerationVacuumExecMaxNanos.Load() + vacuumRuns := db.vlogGenerationVacuumRuns.Load() stats["treedb.cache.vlog_retained_segments"] = fmt.Sprintf("%d", vlogSegments) stats["treedb.cache.vlog_retained_bytes_estimate"] = fmt.Sprintf("%d", vlogBytes) stats["treedb.process.memory.vlog_retained_bytes_estimate"] = fmt.Sprintf("%d", vlogBytes) + stats["treedb.cache.vlog_retained_prune.closed_bytes"] = fmt.Sprintf("%d", db.valueLogRetainedClosedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_unix_nano"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastUnixNano.Load()) + stats["treedb.cache.vlog_retained_prune.runs"] = fmt.Sprintf("%d", db.retainedValueLogPruneRuns.Load()) + stats["treedb.cache.vlog_retained_prune.forced_runs"] = fmt.Sprintf("%d", 
db.retainedValueLogPruneForcedRuns.Load()) + stats["treedb.cache.vlog_retained_prune.foreground_abort_runs"] = fmt.Sprintf("%d", db.retainedValueLogPruneForegroundAbortRuns.Load()) + stats["treedb.cache.vlog_retained_prune.removed_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneRemovedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.removed_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneRemovedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.in_use_skipped_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneInUseSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.in_use_skipped_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneInUseSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.candidate_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneCandidateSegments.Load()) + stats["treedb.cache.vlog_retained_prune.candidate_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneCandidateBytes.Load()) + stats["treedb.cache.vlog_retained_prune.live_skipped_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneLiveSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.live_skipped_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneLiveSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.parse_skipped_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneParseSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.parse_skipped_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneParseSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.zombie_marked_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneZombieMarkedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.zombie_marked_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneZombieMarkedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceSegments.Load()) + 
stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_candidate"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceCandidateSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_candidate"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceCandidateBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_removed"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceRemovedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_removed"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceRemovedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_in_use_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceInUseSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_in_use_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceInUseSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_live_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceLiveSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_live_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceLiveSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_parse_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceParseSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_parse_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceParseSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_zombie_marked"] = fmt.Sprintf("%d", 
db.retainedValueLogPruneLastObservedSourceZombieMarkedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_zombie_marked"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceZombieMarkedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_candidate_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceCandidateSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_candidate_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceCandidateBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_removed_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceRemovedSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_removed_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceRemovedBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_in_use_skipped_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceInUseSkippedSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_in_use_skipped_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceInUseSkippedBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_live_skipped_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceLiveSkippedSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_live_skipped_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceLiveSkippedBytesTotal.Load()) + 
stats["treedb.cache.vlog_retained_prune.observed_source.segments_parse_skipped_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceParseSkippedSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_parse_skipped_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceParseSkippedBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_zombie_marked_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceZombieMarkedSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_zombie_marked_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceZombieMarkedBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.pressure_bytes"] = fmt.Sprintf("%d", db.retainedPrunePressureBytes()) + stats["treedb.cache.vlog_retained_prune.schedule_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleRequests.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleForcedRequests.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_skip.closing"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipClosing.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_skip.inflight"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipInFlight.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_skip.no_closed_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipNoClosedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_skip.below_pressure"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipBelowPressure.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_skip.min_interval"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipMinInterval.Load()) + stats["treedb.cache.vlog_retained_prune.write_gate_retries"] = fmt.Sprintf("%d", db.retainedValueLogPruneWriteGateRetries.Load()) + 
stats["treedb.cache.vlog_retained_prune.write_gate_retry_successes"] = fmt.Sprintf("%d", db.retainedValueLogPruneWriteGateRetrySuccesses.Load()) + stats["treedb.cache.vlog_retained_prune.force_pending"] = fmt.Sprintf("%t", db.retainedPruneForceRequested.Load()) stats["treedb.cache.vlog_generation.policy"] = fmt.Sprintf("%d", db.valueLogGenerationPolicy) stats["treedb.cache.vlog_generation.enabled"] = fmt.Sprintf("%t", db.valueLogGenerationPolicy == uint8(backenddb.ValueLogGenerationHotWarmCold)) stats["treedb.cache.vlog_generation.maintenance_phase"] = maintenancePhaseString(db.maintenancePhase.Load()) @@ -19447,18 +21087,63 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.checkpoint_kick.runs"] = fmt.Sprintf("%d", db.vlogGenerationCheckpointKickRuns.Load()) stats["treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"] = fmt.Sprintf("%d", db.vlogGenerationCheckpointKickRewriteRuns.Load()) stats["treedb.cache.vlog_generation.checkpoint_kick.gc_runs"] = fmt.Sprintf("%d", db.vlogGenerationCheckpointKickGCRuns.Load()) + stats["treedb.cache.vlog_generation.checkpoint_kick.skipped_hot_no_debt"] = fmt.Sprintf("%d", db.vlogGenerationCheckpointKickSkippedHotNoDebt.Load()) + stats["treedb.cache.vlog_generation.maintenance.attempts"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceAttempts.Load()) + stats["treedb.cache.vlog_generation.maintenance.acquired"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceAcquired.Load()) + stats["treedb.cache.vlog_generation.maintenance.collisions"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceCollisions.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.wal_on_periodic"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipWALOnPeriodic.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.maintenance_phase"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipPhase.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate"] = fmt.Sprintf("%d", 
db.vlogGenerationMaintenanceSkipStageGate.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate_not_due"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipStageNotDue.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate_due_reserved"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipStageDue.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.age_blocked_gate"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipAgeBlocked.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.priority_pending"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipPriority.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.quiet_window"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipQuiet.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.before_first_checkpoint"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipPreCheckpoint.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.checkpoint_inflight"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipCheckpointing.Load()) + stats["treedb.cache.vlog_generation.maintenance.passes.noop"] = fmt.Sprintf("%d", db.vlogGenerationMaintenancePassNoop.Load()) + stats["treedb.cache.vlog_generation.maintenance.passes.with_rewrite"] = fmt.Sprintf("%d", db.vlogGenerationMaintenancePassWithRewrite.Load()) + stats["treedb.cache.vlog_generation.maintenance.passes.with_gc"] = fmt.Sprintf("%d", db.vlogGenerationMaintenancePassWithGC.Load()) + stats["treedb.cache.vlog_generation.maintenance.pass.total_ms"] = fmt.Sprintf("%.3f", float64(maintenancePassTotalNS)/float64(time.Millisecond)) + stats["treedb.cache.vlog_generation.maintenance.pass.max_ms"] = fmt.Sprintf("%.3f", float64(maintenancePassMaxNS)/float64(time.Millisecond)) + if maintenancePasses > 0 { + stats["treedb.cache.vlog_generation.maintenance.pass.avg_ms"] = fmt.Sprintf("%.3f", (float64(maintenancePassTotalNS)/float64(maintenancePasses))/float64(time.Millisecond)) + } else { + 
stats["treedb.cache.vlog_generation.maintenance.pass.avg_ms"] = "0.000" + } stats["treedb.cache.vlog_generation.churn_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationChurnBytes.Load()) - stats["treedb.cache.vlog_generation.churn_bytes_per_sec"] = fmt.Sprintf("%d", db.vlogGenerationLastChurnBps.Load()) + stats["treedb.cache.vlog_generation.churn_bytes_per_sec"] = fmt.Sprintf("%d", rewriteChurnBps) stats["treedb.cache.vlog_generation.rewrite.queue_len"] = fmt.Sprintf("%d", rewriteQueueLen) stats["treedb.cache.vlog_generation.rewrite.queue_loaded"] = fmt.Sprintf("%t", rewriteQueueLoaded) + stats["treedb.cache.vlog_generation.rewrite.ledger_segments"] = fmt.Sprintf("%d", rewriteLedgerSegments) + stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_total"] = fmt.Sprintf("%d", rewriteLedgerBytesTotal) + stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_live"] = fmt.Sprintf("%d", rewriteLedgerBytesLive) + stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_stale"] = fmt.Sprintf("%d", rewriteLedgerBytesStale) + if rewriteLedgerBytesTotal > 0 { + stats["treedb.cache.vlog_generation.rewrite.ledger_stale_ratio_ppm"] = fmt.Sprintf("%d", (rewriteLedgerBytesStale*1_000_000)/rewriteLedgerBytesTotal) + } else { + stats["treedb.cache.vlog_generation.rewrite.ledger_stale_ratio_ppm"] = "0" + } + stats["treedb.cache.vlog_generation.rewrite.stage_pending"] = fmt.Sprintf("%t", rewriteStagePending) + stats["treedb.cache.vlog_generation.rewrite.stage_observed_unix_nano"] = fmt.Sprintf("%d", rewriteStageObservedNS) + stats["treedb.cache.vlog_generation.rewrite.penalties_active"] = fmt.Sprintf("%d", rewritePenaltiesActive) + stats["treedb.cache.vlog_generation.rewrite.age_blocked_until_unix_nano"] = fmt.Sprintf("%d", rewriteAgeBlockedUntilNS) + stats["treedb.cache.vlog_generation.rewrite.age_blocked_remaining_ms"] = fmt.Sprintf("%d", rewriteAgeBlockedRemainingMS) stats["treedb.cache.vlog_generation.hot.segment_target_bytes"] = fmt.Sprintf("%d", 
db.valueLogGenerationHotTarget) stats["treedb.cache.vlog_generation.warm.segment_target_bytes"] = fmt.Sprintf("%d", db.valueLogGenerationWarmTarget) stats["treedb.cache.vlog_generation.cold.segment_target_bytes"] = fmt.Sprintf("%d", db.valueLogGenerationColdTarget) stats["treedb.cache.vlog_generation.rewrite_budget.bytes_per_sec"] = fmt.Sprintf("%d", db.valueLogRewriteBudgetBytes) stats["treedb.cache.vlog_generation.rewrite_budget.records_per_sec"] = fmt.Sprintf("%d", db.valueLogRewriteBudgetRecords) + stats["treedb.cache.vlog_generation.rewrite_budget.tokens_bytes"] = fmt.Sprintf("%d", rewriteBudgetTokens) + stats["treedb.cache.vlog_generation.rewrite_budget.tokens_cap_bytes"] = fmt.Sprintf("%d", rewriteBudgetCap) + stats["treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct"] = fmt.Sprintf("%.3f", rewriteBudgetUtilPct) + stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total"] = fmt.Sprintf("%d", rewriteBudgetConsumedTotal) + stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_per_sec"] = fmt.Sprintf("%.3f", rewriteBudgetConsumedPerSec) + stats["treedb.cache.vlog_generation.rewrite_budget.consumed_share_of_budget_pct"] = fmt.Sprintf("%.3f", rewriteBudgetConsumedSharePct) stats["treedb.cache.vlog_generation.rewrite_trigger.stale_ratio_ppm"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerRatioPPM) stats["treedb.cache.vlog_generation.rewrite_trigger.total_bytes"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerBytes) stats["treedb.cache.vlog_generation.rewrite_trigger.churn_per_sec"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerChurn) + stats["treedb.cache.vlog_generation.rewrite.min_segment_age_ms"] = fmt.Sprintf("%d", db.valueLogRewriteMinSegmentAge.Milliseconds()) // PR1 scaffolding: legacy allocator still owns placement; report retained // totals under hot generation until generation-aware allocator lands. 
stats["treedb.cache.vlog_generation.bytes.live.total"] = fmt.Sprintf("%d", retained.BytesTotal) @@ -19477,17 +21162,82 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.segments.hot"] = fmt.Sprintf("%d", retained.SegmentsHot) stats["treedb.cache.vlog_generation.segments.warm"] = fmt.Sprintf("%d", retained.SegmentsWarm) stats["treedb.cache.vlog_generation.segments.cold"] = fmt.Sprintf("%d", retained.SegmentsCold) - stats["treedb.cache.vlog_generation.rewrite.bytes_in"] = fmt.Sprintf("%d", db.vlogGenerationRewriteBytesIn.Load()) - stats["treedb.cache.vlog_generation.rewrite.bytes_out"] = fmt.Sprintf("%d", db.vlogGenerationRewriteBytesOut.Load()) + stats["treedb.cache.vlog_generation.rewrite.bytes_in"] = fmt.Sprintf("%d", rewriteBytesInTotal) + stats["treedb.cache.vlog_generation.rewrite.bytes_out"] = fmt.Sprintf("%d", rewriteBytesOutTotal) + stats["treedb.cache.vlog_generation.rewrite.value_records_copied"] = fmt.Sprintf("%d", rewriteValueRecordsCopiedTotal) + stats["treedb.cache.vlog_generation.rewrite.value_bytes_copied"] = fmt.Sprintf("%d", rewriteValueBytesCopiedTotal) + stats["treedb.cache.vlog_generation.rewrite.leafref_records_copied"] = fmt.Sprintf("%d", rewriteLeafRefRecordsCopiedTotal) + stats["treedb.cache.vlog_generation.rewrite.leafref_bytes_copied"] = fmt.Sprintf("%d", rewriteLeafRefBytesCopiedTotal) + stats["treedb.cache.vlog_generation.rewrite.processed_live_bytes"] = fmt.Sprintf("%d", rewriteProcessedLiveBytes) + stats["treedb.cache.vlog_generation.rewrite.processed_stale_bytes"] = fmt.Sprintf("%d", rewriteProcessedStaleBytes) + stats["treedb.cache.vlog_generation.rewrite.reclaim_ratio"] = fmt.Sprintf("%.6f", rewriteReclaimRatio) + stats["treedb.cache.vlog_generation.rewrite.output_ratio"] = fmt.Sprintf("%.6f", rewriteOutputRatio) + stats["treedb.cache.vlog_generation.rewrite.processed_stale_ratio"] = fmt.Sprintf("%.6f", rewriteProcessedStaleRatio) + stats["treedb.cache.vlog_generation.rewrite.exec.bytes_in_per_sec"] = 
fmt.Sprintf("%.3f", rewriteBytesInPerSec) + stats["treedb.cache.vlog_generation.rewrite.exec.bytes_out_per_sec"] = fmt.Sprintf("%.3f", rewriteBytesOutPerSec) + stats["treedb.cache.vlog_generation.rewrite.exec.reclaimed_bytes_per_sec"] = fmt.Sprintf("%.3f", rewriteReclaimedBytesPerSec) + stats["treedb.cache.vlog_generation.rewrite.exec.reclaimed_vs_churn_ratio"] = fmt.Sprintf("%.6f", rewriteReclaimedVsChurnRatio) + stats["treedb.cache.vlog_generation.rewrite.no_reclaim_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteNoReclaimRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.no_reclaim_stale_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteNoReclaimStaleBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_canceled"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanCanceled.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_canceled_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanCanceledLastNS.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_errors"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanErrors.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_empty"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanEmpty.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_empty.age_blocked"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanEmptyAgeBlocked.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_empty.no_selection"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanEmptyNoSelection.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelected.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_selected_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedSegments.Load()) + 
stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_live"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedLiveBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedStaleBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.runs"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.segments"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterSegments.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.to_empty_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterToEmpty.Load()) + stats["treedb.cache.vlog_generation.observed_gc.pending_ids"] = fmt.Sprintf("%d", observedGCPending) + stats["treedb.cache.vlog_generation.observed_gc.queued_batches"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCQueuedBatches.Load()) + stats["treedb.cache.vlog_generation.observed_gc.queued_ids"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCQueuedIDs.Load()) + stats["treedb.cache.vlog_generation.observed_gc.taken_batches"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCTakenBatches.Load()) + stats["treedb.cache.vlog_generation.observed_gc.taken_ids"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCTakenIDs.Load()) + stats["treedb.cache.vlog_generation.observed_gc.runs"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRuns.Load()) + stats["treedb.cache.vlog_generation.observed_gc.retry_queued"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRetryQueued.Load()) + stats["treedb.cache.vlog_generation.observed_gc.retry_dropped"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRetryDropped.Load()) + stats["treedb.cache.vlog_generation.observed_gc.retry_max_attempts"] = fmt.Sprintf("%d", 
vlogGenerationObservedGCRetryMaxAttempts) + stats["treedb.cache.vlog_generation.observed_gc.latency.completed_ids"] = fmt.Sprintf("%d", observedGCLatencyCompleted) + stats["treedb.cache.vlog_generation.observed_gc.latency.dropped_ids"] = fmt.Sprintf("%d", observedGCLatencyDropped) + stats["treedb.cache.vlog_generation.observed_gc.latency.total_ms"] = fmt.Sprintf("%d", observedGCLatencyTotalMS) + stats["treedb.cache.vlog_generation.observed_gc.latency.max_ms"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCLatencyMaxMS.Load()) + stats["treedb.cache.vlog_generation.observed_gc.latency.avg_ms"] = fmt.Sprintf("%.3f", observedGCLatencyAvgMS) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_in_use_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsProtectedInUseTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_retained_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsProtectedRetainedTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_overlap_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsProtectedOverlapTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_other_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsProtectedOtherTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesTotal.Load()) + 
stats["treedb.cache.vlog_generation.observed_gc.source_bytes_eligible_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesEligibleTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesDeletedTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_in_use_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesProtectedInUseTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesProtectedRetainedTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_overlap_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesProtectedOverlapTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_other_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesProtectedOtherTotal.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteExecSourceSegments.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsRequestedTotal.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsStillReferencedTotal.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsUnreferencedTotal.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_last"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsRequestedLast.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_last"] = fmt.Sprintf("%d", 
db.vlogGenerationRewriteSourceSegmentsStillReferencedLast.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_last"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsUnreferencedLast.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.canceled_runs.fresh_plan"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledFreshPlanRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.canceled_runs.queued_debt"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledQueuedDebtRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledLastNS.Load()) + stats["treedb.cache.vlog_generation.rewrite.deadline_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteDeadlineRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.deadline_runs.fresh_plan"] = fmt.Sprintf("%d", db.vlogGenerationRewriteDeadlineFreshPlanRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.deadline_runs.queued_debt"] = fmt.Sprintf("%d", db.vlogGenerationRewriteDeadlineQueuedDebtRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.deadline_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteDeadlineLastNS.Load()) stats["treedb.cache.vlog_generation.rewrite.queue_prune_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteQueuePruneRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.queue_prune_ids"] = fmt.Sprintf("%d", db.vlogGenerationRewriteQueuePruneIDs.Load()) stats["treedb.cache.vlog_generation.rewrite.ineffective_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteIneffectiveRuns.Load()) @@ -19495,17 +21245,91 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.ineffective_bytes_out"] = fmt.Sprintf("%d", db.vlogGenerationRewriteIneffectiveBytesOut.Load()) 
stats["treedb.cache.vlog_generation.rewrite.ineffective_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteIneffectiveLastNS.Load()) stats["treedb.cache.vlog_generation.rewrite.ineffective_backoff_seconds"] = fmt.Sprintf("%.0f", vlogGenerationRewriteIneffectiveBackoff.Seconds()) + stats["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteReclaimedBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan.total_ms"] = fmt.Sprintf("%.3f", float64(rewritePlanTotalNS)/float64(time.Millisecond)) + stats["treedb.cache.vlog_generation.rewrite.plan.max_ms"] = fmt.Sprintf("%.3f", float64(rewritePlanMaxNS)/float64(time.Millisecond)) + if rewritePlanRuns > 0 { + stats["treedb.cache.vlog_generation.rewrite.plan.avg_ms"] = fmt.Sprintf("%.3f", (float64(rewritePlanTotalNS)/float64(rewritePlanRuns))/float64(time.Millisecond)) + } else { + stats["treedb.cache.vlog_generation.rewrite.plan.avg_ms"] = "0.000" + } + stats["treedb.cache.vlog_generation.rewrite.exec.total_ms"] = fmt.Sprintf("%.3f", float64(rewriteExecTotalNS)/float64(time.Millisecond)) + stats["treedb.cache.vlog_generation.rewrite.exec.max_ms"] = fmt.Sprintf("%.3f", float64(rewriteExecMaxNS)/float64(time.Millisecond)) + if rewriteRuns > 0 { + stats["treedb.cache.vlog_generation.rewrite.exec.avg_ms"] = fmt.Sprintf("%.3f", (float64(rewriteExecTotalNS)/float64(rewriteRuns))/float64(time.Millisecond)) + } else { + stats["treedb.cache.vlog_generation.rewrite.exec.avg_ms"] = "0.000" + } stats["treedb.cache.vlog_generation.rewrite.plan_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastRewritePlanUnixNano.Load()) stats["treedb.cache.vlog_generation.rewrite.last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastRewriteUnixNano.Load()) stats["treedb.cache.vlog_generation.gc.deleted_segments"] = fmt.Sprintf("%d", db.vlogGenerationGCSegmentsDeleted.Load()) stats["treedb.cache.vlog_generation.gc.deleted_bytes"] = fmt.Sprintf("%d", 
db.vlogGenerationGCBytesDeleted.Load()) + stats["treedb.cache.vlog_generation.gc.last_referenced_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsReferenced.Load()) + stats["treedb.cache.vlog_generation.gc.last_referenced_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesReferenced.Load()) + stats["treedb.cache.vlog_generation.gc.last_active_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsActive.Load()) + stats["treedb.cache.vlog_generation.gc.last_active_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesActive.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtected.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesProtected.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_in_use_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtectedInUse.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_in_use_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesProtectedInUse.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_retained_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtectedRetained.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_retained_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesProtectedRetained.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_overlap_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtectedOverlap.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_overlap_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesProtectedOverlap.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_other_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtectedOther.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_other_bytes"] = fmt.Sprintf("%d", 
db.vlogGenerationLastGCBytesProtectedOther.Load()) + stats["treedb.cache.vlog_generation.gc.last_eligible_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsEligible.Load()) + stats["treedb.cache.vlog_generation.gc.last_eligible_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesEligible.Load()) + stats["treedb.cache.vlog_generation.gc.last_deleted_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsDeleted.Load()) + stats["treedb.cache.vlog_generation.gc.last_deleted_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesDeleted.Load()) + stats["treedb.cache.vlog_generation.gc.last_pending_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsPending.Load()) + stats["treedb.cache.vlog_generation.gc.last_pending_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesPending.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegments.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_referenced"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsReferenced.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_active"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsActive.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsProtected.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_in_use"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsProtectedInUse.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_retained"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsProtectedRetained.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_overlap"] = fmt.Sprintf("%d", 
db.vlogGenerationLastGCObservedSourceSegmentsProtectedOverlap.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_other"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsProtectedOther.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_eligible"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsEligible.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_deleted"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsDeleted.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_pending"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsPending.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytes.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_referenced"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesReferenced.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_active"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesActive.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesProtected.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_in_use"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesProtectedInUse.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_retained"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesProtectedRetained.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_overlap"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesProtectedOverlap.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_other"] = fmt.Sprintf("%d", 
db.vlogGenerationLastGCObservedSourceBytesProtectedOther.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_eligible"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesEligible.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_deleted"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesDeleted.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_pending"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesPending.Load()) stats["treedb.cache.vlog_generation.gc.runs"] = fmt.Sprintf("%d", db.vlogGenerationGCRuns.Load()) + stats["treedb.cache.vlog_generation.gc.exec.total_ms"] = fmt.Sprintf("%.3f", float64(gcExecTotalNS)/float64(time.Millisecond)) + stats["treedb.cache.vlog_generation.gc.exec.max_ms"] = fmt.Sprintf("%.3f", float64(gcExecMaxNS)/float64(time.Millisecond)) + if gcRuns > 0 { + stats["treedb.cache.vlog_generation.gc.exec.avg_ms"] = fmt.Sprintf("%.3f", (float64(gcExecTotalNS)/float64(gcRuns))/float64(time.Millisecond)) + } else { + stats["treedb.cache.vlog_generation.gc.exec.avg_ms"] = "0.000" + } stats["treedb.cache.vlog_generation.gc.last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastGCUnixNano.Load()) stats["treedb.cache.vlog_generation.gc.dry_run.last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastGCDryRunUnixNano.Load()) stats["treedb.cache.vlog_generation.gc.dry_run.last_eligible_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCDryRunBytesEligible.Load()) stats["treedb.cache.vlog_generation.gc.dry_run.last_eligible_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCDryRunSegsEligible.Load()) stats["treedb.cache.vlog_generation.vacuum.runs"] = fmt.Sprintf("%d", db.vlogGenerationVacuumRuns.Load()) stats["treedb.cache.vlog_generation.vacuum.failures"] = fmt.Sprintf("%d", db.vlogGenerationVacuumFailures.Load()) + stats["treedb.cache.vlog_generation.vacuum.skipped_disabled"] = fmt.Sprintf("%d", 
db.vlogGenerationVacuumSkippedDisabled.Load()) + stats["treedb.cache.vlog_generation.vacuum.skipped_rewrite_bytes"] = fmt.Sprintf("%d", db.vlogGenerationVacuumSkippedRewriteBytes.Load()) + stats["treedb.cache.vlog_generation.vacuum.skipped_cooldown"] = fmt.Sprintf("%d", db.vlogGenerationVacuumSkippedCooldown.Load()) + stats["treedb.cache.vlog_generation.vacuum.exec.total_ms"] = fmt.Sprintf("%.3f", float64(vacuumExecTotalNS)/float64(time.Millisecond)) + stats["treedb.cache.vlog_generation.vacuum.exec.max_ms"] = fmt.Sprintf("%.3f", float64(vacuumExecMaxNS)/float64(time.Millisecond)) + if vacuumRuns > 0 { + stats["treedb.cache.vlog_generation.vacuum.exec.avg_ms"] = fmt.Sprintf("%.3f", (float64(vacuumExecTotalNS)/float64(vacuumRuns))/float64(time.Millisecond)) + } else { + stats["treedb.cache.vlog_generation.vacuum.exec.avg_ms"] = "0.000" + } stats["treedb.cache.vlog_generation.vacuum.last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastVacuumUnixNano.Load()) stats["treedb.cache.vlog_generation.remap.successes"] = fmt.Sprintf("%d", db.vlogGenerationRemapSuccesses.Load()) stats["treedb.cache.vlog_generation.remap.failures"] = fmt.Sprintf("%d", db.vlogGenerationRemapFailures.Load()) @@ -19654,6 +21478,16 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_mmap.dead_mappings.cap_base"] = fmt.Sprintf("%d", valuelog.MaxDeadMappings) stats["treedb.cache.vlog_mmap.max_mapped_sealed_segments"] = fmt.Sprintf("%d", valuelog.MaxMappedSealedSegments) stats["treedb.cache.vlog_mmap.max_mapped_sealed_bytes"] = fmt.Sprintf("%d", valuelog.MaxMappedSealedBytes) + zombieSegments, zombieBytes, zombiePinnedSegments, zombiePinnedBytes, zombieUnpinnedSegments, zombieUnpinnedBytes := db.valueLogReader.ZombieStats() + stats["treedb.cache.vlog_zombie.segments"] = fmt.Sprintf("%d", zombieSegments) + stats["treedb.cache.vlog_zombie.bytes"] = fmt.Sprintf("%d", zombieBytes) + stats["treedb.cache.vlog_zombie.pinned_segments"] = fmt.Sprintf("%d", zombiePinnedSegments) + 
stats["treedb.cache.vlog_zombie.pinned_bytes"] = fmt.Sprintf("%d", zombiePinnedBytes) + stats["treedb.cache.vlog_zombie.unpinned_segments"] = fmt.Sprintf("%d", zombieUnpinnedSegments) + stats["treedb.cache.vlog_zombie.unpinned_bytes"] = fmt.Sprintf("%d", zombieUnpinnedBytes) + stats["treedb.process.memory.vlog_zombie_bytes_estimate"] = fmt.Sprintf("%d", zombieBytes) + stats["treedb.process.memory.vlog_zombie_pinned_bytes_estimate"] = fmt.Sprintf("%d", zombiePinnedBytes) + stats["treedb.process.memory.vlog_zombie_unpinned_bytes_estimate"] = fmt.Sprintf("%d", zombieUnpinnedBytes) stats["treedb.cache.vlog_mmap.active_segments"] = fmt.Sprintf("%d", cacheVlogMmap.activeSegments) stats["treedb.cache.vlog_mmap.active_bytes"] = fmt.Sprintf("%d", cacheVlogMmap.activeBytes) stats["treedb.cache.vlog_mmap.current_segments"] = fmt.Sprintf("%d", cacheVlogMmap.currentSegments) diff --git a/TreeDB/caching/db_test.go b/TreeDB/caching/db_test.go index cb6e8adb3..9a2a4a024 100644 --- a/TreeDB/caching/db_test.go +++ b/TreeDB/caching/db_test.go @@ -1953,6 +1953,57 @@ func TestCachingDB_PrunesRetainedValueLog(t *testing.T) { } } +func TestOpen_InitializesRetainedClosedBytesFromExistingSegments(t *testing.T) { + dir := t.TempDir() + + opts := Options{ + DisableWAL: true, + RelaxedSync: true, + AllowUnsafe: true, + FlushThreshold: 1 << 20, + ValueLogPointerThreshold: 1, + } + + backend1, err := db.Open(db.Options{Dir: dir, ChunkSize: 64 * 1024}) + if err != nil { + t.Fatalf("backend1 open: %v", err) + } + cache1, err := Open(dir, backend1, opts) + if err != nil { + _ = backend1.Close() + t.Fatalf("cache1 open: %v", err) + } + + if err := cache1.Set([]byte("k"), bytes.Repeat([]byte("x"), page.DefaultInlineThreshold+256)); err != nil { + t.Fatalf("Set: %v", err) + } + cache1.flushAll(false) + if err := cache1.rotateValueLogLocked(&cache1.lanes[0]); err != nil { + t.Fatalf("rotateValueLogLocked: %v", err) + } + if got := cache1.valueLogRetainedClosedBytes.Load(); got <= 0 { + 
t.Fatalf("pre-close retained closed bytes=%d want >0", got) + } + if err := cache1.Close(); err != nil { + t.Fatalf("cache1 close: %v", err) + } + + backend2, err := db.Open(db.Options{Dir: dir, ChunkSize: 64 * 1024}) + if err != nil { + t.Fatalf("backend2 open: %v", err) + } + cache2, err := Open(dir, backend2, opts) + if err != nil { + _ = backend2.Close() + t.Fatalf("cache2 open: %v", err) + } + defer cache2.Close() + + if got := cache2.valueLogRetainedClosedBytes.Load(); got <= 0 { + t.Fatalf("reopen retained closed bytes=%d want >0", got) + } +} + func TestPruneRetainedValueLogs_SkipsLiveScanWhenAllRetainedPathsInUse(t *testing.T) { dir := t.TempDir() backend := NewMockBackend() @@ -1976,7 +2027,7 @@ func TestPruneRetainedValueLogs_SkipsLiveScanWhenAllRetainedPathsInUse(t *testin } cache.markValueLogRetain(retained) - cache.pruneRetainedValueLogs() + pruneStats := cache.pruneRetainedValueLogs(false) backend.mu.RLock() iteratorCalls := backend.iteratorCalls @@ -1987,6 +2038,12 @@ func TestPruneRetainedValueLogs_SkipsLiveScanWhenAllRetainedPathsInUse(t *testin if !cache.valueLogRetained(retained) { t.Fatalf("expected in-use retained path to remain retained") } + if pruneStats.InUseSkippedSegments != 1 { + t.Fatalf("InUseSkippedSegments=%d want 1", pruneStats.InUseSkippedSegments) + } + if pruneStats.CandidateSegments != 0 { + t.Fatalf("CandidateSegments=%d want 0", pruneStats.CandidateSegments) + } } func seedRetainedPrunePressure(cache *DB, retainedPath string, size int64) { @@ -2228,6 +2285,121 @@ func TestRetainedValueLogPrune_AbortsWhenForegroundWritesResume(t *testing.T) { } } +func TestRetainedValueLogPruneForce_RetriesAfterForegroundWritesResume(t *testing.T) { + dir := t.TempDir() + backend := NewMockBackend() + backend.iteratorStartedCh = make(chan struct{}) + backend.iteratorBlockCh = make(chan struct{}) + + cache, err := Open(dir, backend, Options{ + DisableWAL: true, + RelaxedSync: true, + AllowUnsafe: true, + FlushThreshold: 1 << 20, + 
ValueLogPointerThreshold: 1, + }) + if err != nil { + t.Fatalf("cache open: %v", err) + } + defer cache.Close() + + fileID, err := valuelog.EncodeFileID(0, 212) + if err != nil { + t.Fatalf("EncodeFileID: %v", err) + } + retainedPath := filepath.Join(dir, "wal", "value-l0-000212.log") + if err := os.MkdirAll(filepath.Dir(retainedPath), 0o755); err != nil { + t.Fatalf("MkdirAll: %v", err) + } + w, err := valuelog.NewWriter(retainedPath, fileID) + if err != nil { + t.Fatalf("NewWriter: %v", err) + } + if _, err := w.Append(0, nil, 1, bytes.Repeat([]byte("r"), 128)); err != nil { + _ = w.Close() + t.Fatalf("Append: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close writer: %v", err) + } + cache.markValueLogRetain(retainedPath) + seedRetainedPrunePressure(cache, retainedPath, 2<<30) + cache.lastForegroundWriteUnixNano.Store(time.Now().Add(-2 * retainedPruneQuietWindow).UnixNano()) + + cache.scheduleRetainedValueLogPruneForce() + + select { + case <-backend.iteratorStartedCh: + case <-time.After(2 * time.Second): + t.Fatalf("forced prune did not start") + } + + lastWrite := cache.lastForegroundWriteUnixNano.Load() + deadline := time.Now().Add(2 * time.Second) + for !cache.foregroundWritesResumedSince(lastWrite) { + if time.Now().After(deadline) { + t.Fatalf("foreground write timestamp did not advance") + } + cache.noteWrite() + time.Sleep(time.Millisecond) + } + close(backend.iteratorBlockCh) + cache.waitForRetainedValueLogPrune() + + if cache.valueLogRetained(retainedPath) { + t.Fatalf("retained path still marked after forced retry prune") + } + stats := cache.Stats() + if got := stats["treedb.cache.vlog_retained_prune.forced_runs"]; got != "1" { + t.Fatalf("forced_runs=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.foreground_abort_runs"]; got != "0" { + t.Fatalf("foreground_abort_runs=%q want 0", got) + } + if got := stats["treedb.cache.vlog_retained_prune.write_gate_retries"]; got != "1" { + t.Fatalf("write_gate_retries=%q 
want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.write_gate_retry_successes"]; got != "1" { + t.Fatalf("write_gate_retry_successes=%q want 1", got) + } +} + +func TestAllowValueLogPointers_HardCapRequestsForcedRetainedPrune(t *testing.T) { + cache := &DB{} + cache.testSkipRetainedPrune = true + cache.maxValueLogRetainedBytesHard = 1024 + cache.valueLogRetainedClosedBytes.Store(2048) + + if cache.allowValueLogPointers() { + t.Fatalf("allowValueLogPointers=true, want false when hard cap exceeded") + } + if got := cache.retainedValueLogPruneScheduleForcedRequests.Load(); got != 1 { + t.Fatalf("schedule_forced_requests=%d want 1 after first hard-cap crossing", got) + } + + // Re-check while still over cap should not repeatedly re-schedule until + // retained bytes drop back below the hard cap. + if cache.allowValueLogPointers() { + t.Fatalf("allowValueLogPointers=true on repeated over-cap check, want false") + } + if got := cache.retainedValueLogPruneScheduleForcedRequests.Load(); got != 1 { + t.Fatalf("schedule_forced_requests=%d want 1 after repeated over-cap check", got) + } + + cache.valueLogRetainedClosedBytes.Store(0) + if !cache.allowValueLogPointers() { + t.Fatalf("allowValueLogPointers=false, want true after dropping below hard cap") + } + + cache.valueLogRetainedClosedBytes.Store(4096) + if cache.allowValueLogPointers() { + t.Fatalf("allowValueLogPointers=true after second hard-cap crossing, want false") + } + if got := cache.retainedValueLogPruneScheduleForcedRequests.Load(); got != 2 { + t.Fatalf("schedule_forced_requests=%d want 2 after second hard-cap crossing", got) + } +} + func TestCheckpoint_RateLimitsRetainedValueLogPrune(t *testing.T) { dir := t.TempDir() backend := NewMockBackend() @@ -2361,6 +2533,145 @@ func TestCheckpoint_SkipsRetainedValueLogPruneBelowPressureThreshold(t *testing. 
if cache.retainedPruneActive() { cache.waitForRetainedValueLogPrune() } + stats := cache.Stats() + if got := stats["treedb.cache.vlog_retained_prune.schedule_requests"]; got != "1" { + t.Fatalf("schedule_requests=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"]; got != "0" { + t.Fatalf("schedule_forced_requests=%q want 0", got) + } + if got := stats["treedb.cache.vlog_retained_prune.schedule_skip.below_pressure"]; got != "1" { + t.Fatalf("schedule_skip.below_pressure=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.closed_bytes"]; got != "128" { + t.Fatalf("closed_bytes=%q want 128", got) + } +} + +func TestRetainedValueLogPruneForce_BypassesPressureThreshold(t *testing.T) { + dir := t.TempDir() + backend := NewMockBackend() + backend.iteratorStartedCh = make(chan struct{}) + backend.iteratorBlockCh = make(chan struct{}) + + cache, err := Open(dir, backend, Options{ + DisableWAL: true, + RelaxedSync: true, + AllowUnsafe: true, + FlushThreshold: 1 << 20, + MaxValueLogRetainedBytes: 1 << 20, + ValueLogPointerThreshold: 1, + }) + if err != nil { + t.Fatalf("cache open: %v", err) + } + defer cache.Close() + + fileID, err := valuelog.EncodeFileID(0, 245) + if err != nil { + t.Fatalf("EncodeFileID: %v", err) + } + retainedPath := filepath.Join(dir, "wal", "value-l0-000245.log") + if err := os.MkdirAll(filepath.Dir(retainedPath), 0o755); err != nil { + t.Fatalf("MkdirAll: %v", err) + } + w, err := valuelog.NewWriter(retainedPath, fileID) + if err != nil { + t.Fatalf("NewWriter: %v", err) + } + if _, err := w.Append(0, nil, 1, bytes.Repeat([]byte("t"), 128)); err != nil { + _ = w.Close() + t.Fatalf("Append: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close writer: %v", err) + } + cache.markValueLogRetain(retainedPath) + seedRetainedPrunePressure(cache, retainedPath, 128) + cache.lastForegroundWriteUnixNano.Store(time.Now().Add(-2 * retainedPruneQuietWindow).UnixNano()) + + 
cache.scheduleRetainedValueLogPruneForce() + + select { + case <-backend.iteratorStartedCh: + case <-time.After(2 * time.Second): + t.Fatalf("forced retained prune did not start below pressure threshold") + } + close(backend.iteratorBlockCh) + cache.waitForRetainedValueLogPrune() + stats := cache.Stats() + if got := stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"]; got != "1" { + t.Fatalf("schedule_forced_requests=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.forced_runs"]; got != "1" { + t.Fatalf("forced_runs=%q want 1", got) + } +} + +func TestRetainedValueLogPruneForce_PreemptsQuietWait(t *testing.T) { + dir := t.TempDir() + backend := NewMockBackend() + backend.iteratorStartedCh = make(chan struct{}) + backend.iteratorBlockCh = make(chan struct{}) + + cache, err := Open(dir, backend, Options{ + DisableWAL: true, + RelaxedSync: true, + AllowUnsafe: true, + FlushThreshold: 1 << 20, + ValueLogPointerThreshold: 1, + }) + if err != nil { + t.Fatalf("cache open: %v", err) + } + defer cache.Close() + + fileID, err := valuelog.EncodeFileID(0, 246) + if err != nil { + t.Fatalf("EncodeFileID: %v", err) + } + retainedPath := filepath.Join(dir, "wal", "value-l0-000246.log") + if err := os.MkdirAll(filepath.Dir(retainedPath), 0o755); err != nil { + t.Fatalf("MkdirAll: %v", err) + } + w, err := valuelog.NewWriter(retainedPath, fileID) + if err != nil { + t.Fatalf("NewWriter: %v", err) + } + if _, err := w.Append(0, nil, 1, bytes.Repeat([]byte("u"), 128)); err != nil { + _ = w.Close() + t.Fatalf("Append: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close writer: %v", err) + } + cache.markValueLogRetain(retainedPath) + seedRetainedPrunePressure(cache, retainedPath, 2<<30) + cache.lastForegroundWriteUnixNano.Store(time.Now().UnixNano()) + + cache.scheduleRetainedValueLogPrune() + select { + case <-backend.iteratorStartedCh: + t.Fatalf("retained prune started before quiet window elapsed") + case 
<-time.After(retainedPruneNegativeAssertWait): + } + + cache.scheduleRetainedValueLogPruneForce() + + select { + case <-backend.iteratorStartedCh: + case <-time.After(2 * time.Second): + t.Fatalf("forced retained prune did not preempt quiet-window wait") + } + close(backend.iteratorBlockCh) + cache.waitForRetainedValueLogPrune() + stats := cache.Stats() + if got := stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"]; got != "1" { + t.Fatalf("schedule_forced_requests=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.forced_runs"]; got != "1" { + t.Fatalf("forced_runs=%q want 1", got) + } } func TestCheckpoint_DoesNotWaitForPriorRetainedValueLogPrune(t *testing.T) { diff --git a/TreeDB/caching/expvar_stats.go b/TreeDB/caching/expvar_stats.go index 8c3205ca0..3e9f2b7ed 100644 --- a/TreeDB/caching/expvar_stats.go +++ b/TreeDB/caching/expvar_stats.go @@ -139,6 +139,8 @@ func selectTreeDBExpvarStats(stats map[string]string) map[string]any { strings.HasPrefix(k, "treedb.cache.vlog_payload_split.") || strings.HasPrefix(k, "treedb.cache.vlog_auto.") || strings.HasPrefix(k, "treedb.cache.vlog_dict.") || + strings.HasPrefix(k, "treedb.cache.vlog_generation.") || + strings.HasPrefix(k, "treedb.cache.vlog_retained_prune.") || strings.HasPrefix(k, "treedb.cache.vlog_payload_kind.") || strings.HasPrefix(k, "treedb.cache.vlog_outer_leaf_codec.") || strings.HasPrefix(k, "treedb.cache.batch_arena.") { diff --git a/TreeDB/caching/expvar_stats_test.go b/TreeDB/caching/expvar_stats_test.go index f4de57519..cc88d93ab 100644 --- a/TreeDB/caching/expvar_stats_test.go +++ b/TreeDB/caching/expvar_stats_test.go @@ -28,6 +28,8 @@ func TestSelectTreeDBExpvarStatsFiltersAndCoerces(t *testing.T) { "treedb.cache.vlog_dict.current_k": "32", "treedb.cache.vlog_payload_kind.raw_bytes.single_value": "2048", "treedb.cache.vlog_outer_leaf_codec.raw_bytes.lz4": "512", + "treedb.cache.vlog_generation.rewrite.reclaimed_bytes": "1234", + 
"treedb.cache.vlog_retained_prune.runs": "3", "treedb.process.memory.heap_inuse_bytes": "4096", "treedb.process.memory.pool_pressure_level": "critical", "treedb.cache.batch_arena.pool_bytes_estimate": "65536", @@ -80,6 +82,12 @@ func TestSelectTreeDBExpvarStatsFiltersAndCoerces(t *testing.T) { if v, ok := got["treedb.cache.vlog_outer_leaf_codec.raw_bytes.lz4"].(int64); !ok || v != 512 { t.Fatalf("vlog_outer_leaf_codec.raw_bytes.lz4=%T(%v) want int64(512)", got["treedb.cache.vlog_outer_leaf_codec.raw_bytes.lz4"], got["treedb.cache.vlog_outer_leaf_codec.raw_bytes.lz4"]) } + if v, ok := got["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"].(int64); !ok || v != 1234 { + t.Fatalf("vlog_generation.rewrite.reclaimed_bytes=%T(%v) want int64(1234)", got["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"], got["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"]) + } + if v, ok := got["treedb.cache.vlog_retained_prune.runs"].(int64); !ok || v != 3 { + t.Fatalf("vlog_retained_prune.runs=%T(%v) want int64(3)", got["treedb.cache.vlog_retained_prune.runs"], got["treedb.cache.vlog_retained_prune.runs"]) + } if v, ok := got["treedb.process.memory.heap_inuse_bytes"].(int64); !ok || v != 4096 { t.Fatalf("heap_inuse_bytes=%T(%v) want int64(4096)", got["treedb.process.memory.heap_inuse_bytes"], got["treedb.process.memory.heap_inuse_bytes"]) } diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 0beb47e61..dc469bf50 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -6,6 +6,7 @@ import ( "fmt" "os" "path/filepath" + "sort" "sync" "testing" "time" @@ -273,16 +274,229 @@ func TestShouldRunVlogGenerationRewrite_NoTrigger(t *testing.T) { } } -func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesQualityFloor(t *testing.T) { +func TestObserveVlogGenerationRewritePlanOutcome_SelectedTracksBytes(t *testing.T) { + db := &DB{} + 
db.observeVlogGenerationRewritePlanOutcome(backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11}, + SegmentsSelected: 1, + SelectedBytesTotal: 1024, + SelectedBytesLive: 640, + SelectedBytesStale: 384, + }, nil) + if got, want := db.vlogGenerationRewritePlanRuns.Load(), uint64(1); got != want { + t.Fatalf("plan runs=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelected.Load(), uint64(1); got != want { + t.Fatalf("plan selected=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelectedBytes.Load(), uint64(1024); got != want { + t.Fatalf("plan selected bytes total=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelectedLiveBytes.Load(), uint64(640); got != want { + t.Fatalf("plan selected bytes live=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelectedStaleBytes.Load(), uint64(384); got != want { + t.Fatalf("plan selected bytes stale=%d want=%d", got, want) + } +} + +func TestObserveVlogGenerationRewritePlanOutcome_SelectedTracksSegmentFallbackBytes(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewritePlanOutcome(backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11, 22}, + SegmentsSelected: 2, + SelectedSegments: []backenddb.ValueLogRewritePlanSegment{ + {FileID: 11, BytesTotal: 100, BytesLive: 25, BytesStale: 75}, + {FileID: 22, BytesTotal: 120, BytesLive: 40, BytesStale: 80}, + }, + }, nil) + if got, want := db.vlogGenerationRewritePlanSelectedBytes.Load(), uint64(220); got != want { + t.Fatalf("fallback selected bytes total=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelectedLiveBytes.Load(), uint64(65); got != want { + t.Fatalf("fallback selected bytes live=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelectedStaleBytes.Load(), uint64(155); got != want { + t.Fatalf("fallback selected bytes stale=%d want=%d", got, want) + } +} + +func 
TestObserveVlogGenerationRewritePlanOutcome_EmptyReasonBuckets(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewritePlanOutcome(backenddb.ValueLogRewritePlan{ + AgeBlockedSegments: 2, + AgeBlockedMinRemainingAge: 3 * time.Second, + }, nil) + db.observeVlogGenerationRewritePlanOutcome(backenddb.ValueLogRewritePlan{}, nil) + + if got, want := db.vlogGenerationRewritePlanEmpty.Load(), uint64(2); got != want { + t.Fatalf("plan empty=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanEmptyAgeBlocked.Load(), uint64(1); got != want { + t.Fatalf("plan empty age-blocked=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanEmptyNoSelection.Load(), uint64(1); got != want { + t.Fatalf("plan empty no-selection=%d want=%d", got, want) + } +} + +func TestObserveVlogGenerationRewritePlanPenaltyFilterCounters(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewritePlanPenaltyFilter(5, 2) + db.observeVlogGenerationRewritePlanPenaltyFilter(2, 0) + + if got, want := db.vlogGenerationRewritePlanPenaltyFilterRuns.Load(), uint64(2); got != want { + t.Fatalf("penalty filter runs=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanPenaltyFilterSegments.Load(), uint64(5); got != want { + t.Fatalf("penalty filter segments=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanPenaltyFilterToEmpty.Load(), uint64(1); got != want { + t.Fatalf("penalty filter to-empty=%d want=%d", got, want) + } +} + +func TestObserveVlogGenerationRewriteCanceledCountersByQueueState(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewriteCanceled(false) + db.observeVlogGenerationRewriteCanceled(true) + + if got, want := db.vlogGenerationRewriteCanceledRuns.Load(), uint64(2); got != want { + t.Fatalf("rewrite canceled total=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewriteCanceledFreshPlanRuns.Load(), uint64(1); got != want { + t.Fatalf("rewrite canceled fresh=%d want=%d", got, want) + 
} + if got, want := db.vlogGenerationRewriteCanceledQueuedDebtRuns.Load(), uint64(1); got != want { + t.Fatalf("rewrite canceled queued=%d want=%d", got, want) + } +} + +func TestObserveVlogGenerationRewriteDeadlineCountersByQueueState(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewriteDeadline(false) + db.observeVlogGenerationRewriteDeadline(true) + + if got, want := db.vlogGenerationRewriteDeadlineRuns.Load(), uint64(2); got != want { + t.Fatalf("rewrite deadline total=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewriteDeadlineFreshPlanRuns.Load(), uint64(1); got != want { + t.Fatalf("rewrite deadline fresh=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewriteDeadlineQueuedDebtRuns.Load(), uint64(1); got != want { + t.Fatalf("rewrite deadline queued=%d want=%d", got, want) + } +} + +func TestMaybeRunVlogGenerationMaintenanceWithOptions_TracksWalOnPeriodicSkip(t *testing.T) { + db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} + db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{}) + if got, want := db.vlogGenerationMaintenanceAttempts.Load(), uint64(1); got != want { + t.Fatalf("maintenance attempts=%d want=%d", got, want) + } + if got, want := db.vlogGenerationMaintenanceSkipWALOnPeriodic.Load(), uint64(1); got != want { + t.Fatalf("maintenance wal-on periodic skips=%d want=%d", got, want) + } + if got := db.vlogGenerationMaintenanceAcquired.Load(); got != 0 { + t.Fatalf("maintenance acquired=%d want=0", got) + } +} + +func TestMaybeRunVlogGenerationMaintenanceWithOptions_TracksCollision(t *testing.T) { + db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} + db.vlogGenerationMaintenanceActive.Store(true) + db.maybeRunVlogGenerationMaintenanceWithOptions(false, vlogGenerationMaintenanceOptions{}) + if got, want := db.vlogGenerationMaintenanceAttempts.Load(), uint64(1); got != want { + t.Fatalf("maintenance 
attempts=%d want=%d", got, want) + } + if got, want := db.vlogGenerationMaintenanceCollisions.Load(), uint64(1); got != want { + t.Fatalf("maintenance collisions=%d want=%d", got, want) + } + if got := db.vlogGenerationMaintenanceAcquired.Load(); got != 0 { + t.Fatalf("maintenance acquired=%d want=0", got) + } +} + +func TestShouldRunVlogGenerationIndexVacuum_TracksSkipReasons(t *testing.T) { + db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} + now := time.Now() + if db.shouldRunVlogGenerationIndexVacuum(vlogGenerationVacuumTriggerRewriteBytes-1, now) { + t.Fatalf("expected vacuum to skip below rewrite trigger") + } + if got, want := db.vlogGenerationVacuumSkippedRewriteBytes.Load(), uint64(1); got != want { + t.Fatalf("vacuum skipped_rewrite_bytes=%d want=%d", got, want) + } + db.vlogGenerationLastVacuumUnixNano.Store(now.UnixNano()) + if db.shouldRunVlogGenerationIndexVacuum(vlogGenerationVacuumTriggerRewriteBytes, now) { + t.Fatalf("expected vacuum to skip during cooldown") + } + if got, want := db.vlogGenerationVacuumSkippedCooldown.Load(), uint64(1); got != want { + t.Fatalf("vacuum skipped_cooldown=%d want=%d", got, want) + } +} + +func TestMaybeRunVlogGenerationIndexVacuum_TracksDisabledSkip(t *testing.T) { + t.Setenv(envDisableVlogGenerationVacuum, "1") + db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} + db.maybeRunVlogGenerationIndexVacuum(vlogGenerationVacuumTriggerRewriteBytes) + if got, want := db.vlogGenerationVacuumSkippedDisabled.Load(), uint64(1); got != want { + t.Fatalf("vacuum skipped_disabled=%d want=%d", got, want) + } +} + +func TestRunVlogGenerationMaintenanceRetries_CoalescesPendingCollisionRetries(t *testing.T) { + db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} + + db.vlogGenerationMaintenanceActive.Store(true) + db.vlogGenerationCheckpointKickPending.Store(true) + 
db.runVlogGenerationMaintenanceRetries(vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: false, + rewriteDebtDrain: true, + debugSource: "checkpoint_pending", + }, 30*time.Millisecond, false) + if got := db.vlogGenerationMaintenanceCollisions.Load(); got != 0 { + t.Fatalf("checkpoint pending retry collisions=%d want=0", got) + } + + db.vlogGenerationMaintenanceActive.Store(true) + db.vlogGenerationCheckpointKickPending.Store(false) + db.runVlogGenerationMaintenanceRetries(vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: false, + rewriteDebtDrain: true, + debugSource: "checkpoint_pending", + }, 30*time.Millisecond, false) + if got := db.vlogGenerationMaintenanceCollisions.Load(); got != 0 { + t.Fatalf("checkpoint retry collisions while active=%d want=0", got) + } + + db.vlogGenerationMaintenanceActive.Store(true) + db.vlogGenerationDeferredMaintenancePending.Store(true) + db.runVlogGenerationMaintenanceRetries(vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: false, + rewriteDebtDrain: true, + debugSource: "rewrite_stage_confirm", + }, 30*time.Millisecond, true) + if got := db.vlogGenerationMaintenanceCollisions.Load(); got != 0 { + t.Fatalf("deferred pending retry collisions=%d want=0", got) + } +} + +func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesConfiguredTriggerRatio(t *testing.T) { db := &DB{valueLogRewriteTriggerRatioPPM: 200000} - if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { + if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), 0.50; got != want { t.Fatalf("generic min stale ratio=%f want=%f", got, want) } } -func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesHigherConfiguredRatio(t *testing.T) { +func 
TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesHigherConfiguredTriggerRatio(t *testing.T) { db := &DB{valueLogRewriteTriggerRatioPPM: 800000} - if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { + if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), 0.80; got != want { t.Fatalf("generic min stale ratio=%f want=%f", got, want) } } @@ -301,9 +515,16 @@ func TestVlogGenerationRewriteMinStaleRatioForGenericPass_DisabledBelowEfficacyF } } +func TestVlogGenerationRewriteMinStaleRatioForGenericPass_DefaultWithoutConfiguredTrigger(t *testing.T) { + db := &DB{valueLogRewriteTriggerRatioPPM: 0} + if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { + t.Fatalf("generic min stale ratio=%f want=%f", got, want) + } +} + func TestVlogGenerationRewriteMinStaleRatioForQueuedDebt_UsesGenericFloorForTotalBytes(t *testing.T) { db := &DB{valueLogRewriteTriggerRatioPPM: 200000} - if got, want := db.vlogGenerationRewriteMinStaleRatioForQueuedDebt(8<<30, vlogGenerationReasonTotalBytes), vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { + if got, want := db.vlogGenerationRewriteMinStaleRatioForQueuedDebt(8<<30, vlogGenerationReasonTotalBytes), 0.50; got != want { t.Fatalf("queued total-bytes min stale ratio=%f want=%f", got, want) } } @@ -509,6 +730,51 @@ func TestVlogGenerationRewriteMaxSegmentsForRun_ClampsDebtDrainQueue(t *testing. 
} } +func TestVlogGenerationRewriteMaxSegmentsForFreshPlan_BelowQueueThreshold(t *testing.T) { + db := &DB{ + valueLogRewriteBudgetBytes: 1024, + valueLogGenerationWarmTarget: 256, + } + got := db.vlogGenerationRewriteMaxSegmentsForFreshPlan( + vlogGenerationRewriteFreshPlanDebtDrainMinSegments-1, + 1<<20, + vlogGenerationMaintenanceOptions{rewriteDebtDrain: true, debugSource: "rewrite_age_blocked"}, + ) + if got != vlogGenerationRewriteResumeMaxSegments { + t.Fatalf("fresh-plan queue 0 { + idx := b.gcCalls - 1 + if idx < 0 { + idx = 0 + } + if idx >= len(b.gcResponses) { + idx = len(b.gcResponses) - 1 + } + stats = b.gcResponses[idx] + } err := b.gcErr b.mu.Unlock() + if customFn != nil { + return customFn(ctx, opts) + } return stats, err } @@ -572,6 +856,15 @@ func cloneRewriteOptsForTest(opts backenddb.ValueLogRewriteOnlineOptions) backen return cloned } +func cloneGCOptsForTest(opts backenddb.ValueLogGCOptions) backenddb.ValueLogGCOptions { + cloned := opts + cloned.ProtectedPaths = append([]string(nil), opts.ProtectedPaths...) + cloned.ProtectedInUsePaths = append([]string(nil), opts.ProtectedInUsePaths...) + cloned.ProtectedRetainedPaths = append([]string(nil), opts.ProtectedRetainedPaths...) + cloned.ObservedSourceFileIDs = append([]uint32(nil), opts.ObservedSourceFileIDs...) 
+ return cloned +} + func (b *rewriteBudgetRecordingBackend) recordedPlan() (backenddb.ValueLogRewriteOnlineOptions, int) { b.mu.Lock() defer b.mu.Unlock() @@ -581,7 +874,31 @@ func (b *rewriteBudgetRecordingBackend) recordedPlan() (backenddb.ValueLogRewrit func (b *rewriteBudgetRecordingBackend) recordedGC() (backenddb.ValueLogGCStats, int) { b.mu.Lock() defer b.mu.Unlock() - return b.gcResponse, b.gcCalls + stats := b.gcResponse + if len(b.gcResponses) > 0 && b.gcCalls > 0 { + idx := b.gcCalls - 1 + if idx >= len(b.gcResponses) { + idx = len(b.gcResponses) - 1 + } + stats = b.gcResponses[idx] + } + return stats, b.gcCalls +} + +func (b *rewriteBudgetRecordingBackend) recordedGCObservedSourceCalls() int { + b.mu.Lock() + defer b.mu.Unlock() + count := 0 + for _, opts := range b.gcOpts { + if opts.DryRun { + continue + } + if len(opts.ObservedSourceFileIDs) == 0 { + continue + } + count++ + } + return count } func openRewriteQueueTestDB(t *testing.T, dir string, recorder *rewriteBudgetRecordingBackend) (*DB, func()) { @@ -686,7 +1003,7 @@ func TestVlogGenerationMaintenance_SerializesConcurrentRuns(t *testing.T) { // While the first pass is still inside rewrite, a concurrent pass should be // skipped by the maintenance-active gate instead of issuing a second rewrite. 
- db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{ + db.maybeRunVlogGenerationMaintenanceWithOptions(false, vlogGenerationMaintenanceOptions{ bypassQuiet: true, skipRetainedPruneWait: true, skipCheckpoint: true, @@ -797,9 +1114,324 @@ func TestVlogGenerationRewrite_QueuedExecIgnoresForegroundCancelUntilBoundedComp if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs"]; got != "0" { t.Fatalf("rewrite canceled runs=%q want 0 for bounded queued rewrite", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs.fresh_plan"]; got != "0" { + t.Fatalf("rewrite canceled fresh runs=%q want 0 for bounded queued rewrite", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs.queued_debt"]; got != "0" { + t.Fatalf("rewrite canceled queued runs=%q want 0 for bounded queued rewrite", got) + } } -func TestVlogGenerationRewrite_CanceledFreshPlanQueuesPendingResume(t *testing.T) { +func TestVlogGenerationRewrite_ObservedSourceRetainedBlock_RunsSecondGC(t *testing.T) { + prepareDirectSchedulerTest(t) + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + rewriteResponse: backenddb.ValueLogRewriteStats{ + BytesBefore: 128, + BytesAfter: 128, + RecordsCopied: 1, + SourceSegmentsRequested: 1, + SourceSegmentsStillReferenced: 0, + SourceSegmentsUnreferenced: 1, + }, + gcResponses: []backenddb.ValueLogGCStats{ + { + BytesProtectedRetained: 64, + BytesEligible: 0, + ObservedSourceSegments: 1, + ObservedSourceSegmentsReferenced: 0, + ObservedSourceSegmentsEligible: 0, + ObservedSourceSegmentsProtectedRetained: 1, + ObservedSourceBytesProtectedRetained: 64, + }, + { + BytesProtectedRetained: 0, + BytesEligible: 64, + BytesDeleted: 64, + ObservedSourceSegments: 1, + ObservedSourceSegmentsEligible: 1, + ObservedSourceSegmentsDeleted: 1, + 
ObservedSourceBytesEligible: 64, + ObservedSourceBytesDeleted: 64, + }, + }, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + defer cleanup() + skipRetainedPrune(db) + + if err := db.setVlogGenerationRewriteQueue([]uint32{11}); err != nil { + t.Fatalf("seed rewrite queue: %v", err) + } + db.vlogGenerationRewriteBudgetTokensBytes.Store(1024) + forceVlogMaintenanceIdle(db) + forceRetainedPruneIdle(db) + + db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: true, + rewriteDebtDrain: true, + }) + + if got := recorder.recordedGCObservedSourceCalls(); got != 2 { + t.Fatalf("observed-source gc calls=%d want 2 when observed source is retained-blocked", got) + } + if got := db.vlogGenerationLastGCObservedSourceSegmentsEligible.Load(); got != 1 { + t.Fatalf("last observed source eligible segments=%d want 1 after second gc", got) + } + if got := db.vlogGenerationLastGCObservedSourceBytesDeleted.Load(); got != 64 { + t.Fatalf("last observed source deleted bytes=%d want 64 after second gc", got) + } +} + +func TestVlogGenerationObservedSourceGCQueue_CountersAndDedupe(t *testing.T) { + db := &DB{} + + db.queueVlogGenerationObservedSourceGCList([]uint32{7, 9, 7, 0}) + db.queueVlogGenerationObservedSourceGCIDs(map[uint32]struct{}{ + 0: {}, + 9: {}, + 12: {}, + }) + + if got := db.vlogGenerationObservedGCQueuedBatches.Load(); got != 2 { + t.Fatalf("queued batches=%d want 2", got) + } + if got := db.vlogGenerationObservedGCQueuedIDs.Load(); got != 3 { + t.Fatalf("queued ids=%d want 3", got) + } + + ids := db.takeVlogGenerationObservedSourceGCList() + sort.Slice(ids, func(i, j int) bool { return ids[i] < ids[j] }) + want := []uint32{7, 9, 12} + if len(ids) != len(want) { + t.Fatalf("taken ids len=%d want %d (%v)", len(ids), len(want), ids) + } + for i := range ids { + if ids[i] != want[i] { + t.Fatalf("taken ids[%d]=%d want %d (all=%v)", i, ids[i], want[i], ids) 
+ } + } + + if got := db.vlogGenerationObservedGCTakenBatches.Load(); got != 1 { + t.Fatalf("taken batches=%d want 1", got) + } + if got := db.vlogGenerationObservedGCTakenIDs.Load(); got != uint64(len(want)) { + t.Fatalf("taken ids=%d want %d", got, len(want)) + } + + // Empty take should not mutate taken counters. + _ = db.takeVlogGenerationObservedSourceGCList() + if got := db.vlogGenerationObservedGCTakenBatches.Load(); got != 1 { + t.Fatalf("taken batches after empty take=%d want 1", got) + } + if got := db.vlogGenerationObservedGCTakenIDs.Load(); got != uint64(len(want)) { + t.Fatalf("taken ids after empty take=%d want %d", got, len(want)) + } +} + +func TestVlogGenerationMaintenance_ObservedSourceGCBypassQuietIgnoresForegroundResume(t *testing.T) { + prepareDirectSchedulerTest(t) + t.Setenv(envDisableVlogGenerationRewrite, "1") + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + gcFn: func(ctx context.Context, _ backenddb.ValueLogGCOptions) (backenddb.ValueLogGCStats, error) { + select { + case <-time.After(200 * time.Millisecond): + if err := ctx.Err(); err != nil { + return backenddb.ValueLogGCStats{}, err + } + return backenddb.ValueLogGCStats{}, nil + case <-ctx.Done(): + return backenddb.ValueLogGCStats{}, ctx.Err() + } + }, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + defer cleanup() + skipRetainedPrune(db) + + db.queueVlogGenerationObservedSourceGCList([]uint32{11}) + db.vlogGenerationLastGCUnixNano.Store(time.Now().Add(-time.Minute).UnixNano()) + forceVlogMaintenanceIdle(db) + + go func() { + time.Sleep(30 * time.Millisecond) + hot := time.Now().UnixNano() + db.lastForegroundWriteUnixNano.Store(hot) + db.lastForegroundReadUnixNano.Store(hot) + }() + + db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + 
skipRetainedPruneWait: true, + skipCheckpoint: true, + rewriteDebtDrain: true, + }) + + if got := recorder.recordedGCObservedSourceCalls(); got != 1 { + t.Fatalf("observed-source gc calls=%d want 1", got) + } + if got := db.vlogGenerationGCRuns.Load(); got != 1 { + t.Fatalf("gc runs=%d want 1", got) + } + if got := db.vlogGenerationObservedGCRetryQueued.Load(); got != 0 { + t.Fatalf("observed-source gc retry queued=%d want 0", got) + } + if pending := len(db.takeVlogGenerationObservedSourceGCList()); pending != 0 { + t.Fatalf("observed-source gc pending ids=%d want 0", pending) + } +} + +func TestVlogGenerationMaintenance_ObservedSourceGCCompletionClearsRetryState(t *testing.T) { + prepareDirectSchedulerTest(t) + t.Setenv(envDisableVlogGenerationRewrite, "1") + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + gcResponse: backenddb.ValueLogGCStats{ + ObservedSourceSegments: 1, + ObservedSourceSegmentsEligible: 1, + ObservedSourceSegmentsDeleted: 1, + ObservedSourceBytes: 256, + ObservedSourceBytesEligible: 256, + ObservedSourceBytesDeleted: 256, + }, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + defer cleanup() + skipRetainedPrune(db) + + db.queueVlogGenerationObservedSourceGCList([]uint32{41}) + db.vlogGenerationLastGCUnixNano.Store(time.Now().Add(-time.Minute).UnixNano()) + forceVlogMaintenanceIdle(db) + + db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: true, + rewriteDebtDrain: true, + }) + + if got := recorder.recordedGCObservedSourceCalls(); got != 1 { + t.Fatalf("observed-source gc calls=%d want 1", got) + } + if got := db.vlogGenerationObservedGCRetryQueued.Load(); got != 0 { + t.Fatalf("observed-source gc retry queued=%d want 0", got) + } + if got := 
db.vlogGenerationObservedGCRetryDropped.Load(); got != 0 { + t.Fatalf("observed-source gc retry dropped=%d want 0", got) + } + if got := db.vlogGenerationObservedGCLatencyCompletedIDs.Load(); got != 1 { + t.Fatalf("observed-source gc latency completed ids=%d want 1", got) + } + if got := db.vlogGenerationObservedGCLatencyDroppedIDs.Load(); got != 0 { + t.Fatalf("observed-source gc latency dropped ids=%d want 0", got) + } + if pending := len(db.takeVlogGenerationObservedSourceGCList()); pending != 0 { + t.Fatalf("observed-source gc pending ids=%d want 0", pending) + } + db.vlogGenerationObservedGCMu.Lock() + if _, exists := db.vlogGenerationObservedGCRetryAttempts[41]; exists { + db.vlogGenerationObservedGCMu.Unlock() + t.Fatalf("retry attempt state still present for observed id 41") + } + if _, exists := db.vlogGenerationObservedGCFirstQueuedUnixNano[41]; exists { + db.vlogGenerationObservedGCMu.Unlock() + t.Fatalf("first queued timestamp still present for observed id 41") + } + db.vlogGenerationObservedGCMu.Unlock() +} + +func TestVlogGenerationMaintenance_ObservedSourceGCRetryBudgetDropsAfterMaxAttempts(t *testing.T) { + prepareDirectSchedulerTest(t) + t.Setenv(envDisableVlogGenerationRewrite, "1") + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + gcResponse: backenddb.ValueLogGCStats{ + ObservedSourceSegments: 1, + ObservedSourceSegmentsEligible: 0, + ObservedSourceSegmentsProtectedRetained: 1, + ObservedSourceBytes: 128, + ObservedSourceBytesProtectedRetained: 128, + }, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + defer cleanup() + skipRetainedPrune(db) + + db.queueVlogGenerationObservedSourceGCList([]uint32{73}) + passes := int(vlogGenerationObservedGCRetryMaxAttempts) + 1 + for i := 0; i < passes; i++ { + db.vlogGenerationLastGCUnixNano.Store(time.Now().Add(-time.Minute).UnixNano()) + 
forceVlogMaintenanceIdle(db) + db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: true, + rewriteDebtDrain: true, + }) + } + + if got := recorder.recordedGCObservedSourceCalls(); got != passes { + t.Fatalf("observed-source gc calls=%d want %d", got, passes) + } + if got := db.vlogGenerationObservedGCRetryQueued.Load(); got != uint64(vlogGenerationObservedGCRetryMaxAttempts) { + t.Fatalf("observed-source gc retry queued=%d want %d", got, vlogGenerationObservedGCRetryMaxAttempts) + } + if got := db.vlogGenerationObservedGCRetryDropped.Load(); got != 1 { + t.Fatalf("observed-source gc retry dropped=%d want 1", got) + } + if got := db.vlogGenerationObservedGCLatencyCompletedIDs.Load(); got != 0 { + t.Fatalf("observed-source gc latency completed ids=%d want 0", got) + } + if got := db.vlogGenerationObservedGCLatencyDroppedIDs.Load(); got != 1 { + t.Fatalf("observed-source gc latency dropped ids=%d want 1", got) + } + if pending := len(db.takeVlogGenerationObservedSourceGCList()); pending != 0 { + t.Fatalf("observed-source gc pending ids=%d want 0", pending) + } + db.vlogGenerationObservedGCMu.Lock() + if _, exists := db.vlogGenerationObservedGCRetryAttempts[73]; exists { + db.vlogGenerationObservedGCMu.Unlock() + t.Fatalf("retry attempt state still present for observed id 73 after drop") + } + if _, exists := db.vlogGenerationObservedGCFirstQueuedUnixNano[73]; exists { + db.vlogGenerationObservedGCMu.Unlock() + t.Fatalf("first queued timestamp still present for observed id 73 after drop") + } + db.vlogGenerationObservedGCMu.Unlock() +} + +func TestVlogGenerationRewrite_FreshPlanExecIgnoresForegroundCancelUntilBoundedComplete(t *testing.T) { prepareDirectSchedulerTest(t) dir := t.TempDir() @@ -871,39 +1503,37 @@ func TestVlogGenerationRewrite_CanceledFreshPlanQueuesPendingResume(t *testing.T select { case <-done: - case <-time.After(2 * wait): - t.Fatalf("initial 
rewrite did not cancel under foreground activity") + t.Fatalf("rewrite completed early under foreground activity; expected bounded fresh-plan rewrite to continue until release (ctx_ttl=%s)", blocking.recordedRewriteTTL()) + case <-time.After(250 * time.Millisecond): } - deadline := time.Now().Add(2 * wait) - for blocking.recordedRewriteCalls() < 2 { - if time.Now().After(deadline) { - t.Fatalf("pending checkpoint-kick resume did not run (calls=%d)", blocking.recordedRewriteCalls()) - } - time.Sleep(10 * time.Millisecond) + releaseRewrite() + select { + case <-done: + case <-time.After(2 * wait): + t.Fatalf("rewrite did not finish after release") } if ttl := blocking.recordedRewriteTTL(); ttl < 20*time.Second { - t.Fatalf("resume rewrite context ttl=%s want around %s", ttl, vlogGenerationRewriteBoundedExecTimeout) + t.Fatalf("fresh-plan rewrite context ttl=%s want around %s", ttl, vlogGenerationRewriteBoundedExecTimeout) } - releaseRewrite() - deadline = time.Now().Add(2 * wait) - for { - queue, qerr := db.currentVlogGenerationRewriteQueue() - if qerr != nil { - t.Fatalf("load rewrite queue: %v", qerr) - } - if len(queue) == 0 { - break - } - if time.Now().After(deadline) { - t.Fatalf("rewrite queue not drained after resume release: queue=%v calls=%d", queue, blocking.recordedRewriteCalls()) - } - time.Sleep(10 * time.Millisecond) + queue, qerr := db.currentVlogGenerationRewriteQueue() + if qerr != nil { + t.Fatalf("load rewrite queue: %v", qerr) + } + if len(queue) != 0 { + t.Fatalf("rewrite queue not drained after release: queue=%v calls=%d", queue, blocking.recordedRewriteCalls()) } + stats := db.Stats() - if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs"]; got != "1" { - t.Fatalf("rewrite canceled runs=%q want 1", got) + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs"]; got != "0" { + t.Fatalf("rewrite canceled runs=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs.fresh_plan"]; got 
!= "0" { + t.Fatalf("rewrite canceled fresh runs=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs.queued_debt"]; got != "0" { + t.Fatalf("rewrite canceled queued runs=%q want 0", got) } } @@ -2584,9 +3214,84 @@ func TestVlogGenerationRewritePlan_StageConfirmationExecutesConfirmedSubset(t *t SelectedBytesStale: 56 << 20, }, rewriteResponse: backenddb.ValueLogRewriteStats{ - BytesBefore: 64 << 20, - BytesAfter: 8 << 20, - RecordsCopied: 1, + BytesBefore: 64 << 20, + BytesAfter: 8 << 20, + RecordsCopied: 1, + }, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + t.Cleanup(cleanup) + db.valueLogRewriteTriggerBytes = 0 + db.valueLogRewriteTriggerRatioPPM = 1 + db.valueLogGenerationHotTarget = 0 + forceVlogMaintenanceIdle(db) + + if err := db.setVlogGenerationRewriteLedgerWithStage([]backenddb.ValueLogRewritePlanSegment{ + {FileID: 11, BytesTotal: 64 << 20, BytesLive: 20 << 20, BytesStale: 44 << 20, StaleRatio: 0.6875}, + {FileID: 22, BytesTotal: 64 << 20, BytesLive: 8 << 20, BytesStale: 56 << 20, StaleRatio: 0.875}, + }, true, time.Now().Add(-vlogGenerationRewriteMinInterval-time.Second).UnixNano()); err != nil { + t.Fatalf("seed staged rewrite ledger: %v", err) + } + forceRewriteStageConfirmDue(t, db) + + db.maybeRunVlogGenerationMaintenanceWithOptions(false, vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: true, + debugSource: "rewrite_stage_confirm", + }) + + deadline := time.Now().Add(2 * schedulerTestWait(t)) + for { + if _, rewriteCalls := recorder.recordedRewrite(); rewriteCalls >= 1 { + break + } + if time.Now().After(deadline) { + _, rewriteCalls := recorder.recordedRewrite() + t.Fatalf("rewrite calls after staged confirmation=%d want=1", rewriteCalls) + } + time.Sleep(10 * time.Millisecond) + } + rewriteOpts, rewriteCalls := recorder.recordedRewrite() + if rewriteCalls != 1 { + t.Fatalf("rewrite calls after staged confirmation=%d want=1", 
rewriteCalls) + } + if got, want := rewriteOpts.SourceFileIDs, []uint32{22}; len(got) != len(want) || got[0] != want[0] { + t.Fatalf("rewrite SourceFileIDs after staged confirmation=%v want=%v", got, want) + } +} + +func TestVlogGenerationRewritePlan_StageConfirmationDebtDrainProcessesMultipleSegments(t *testing.T) { + prepareDirectSchedulerTest(t) + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + planResponse: backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11, 22, 33}, + SelectedSegments: []backenddb.ValueLogRewritePlanSegment{ + {FileID: 11, BytesTotal: 64 << 20, BytesLive: 8 << 20, BytesStale: 56 << 20, StaleRatio: 0.875}, + {FileID: 22, BytesTotal: 64 << 20, BytesLive: 16 << 20, BytesStale: 48 << 20, StaleRatio: 0.75}, + {FileID: 33, BytesTotal: 64 << 20, BytesLive: 24 << 20, BytesStale: 40 << 20, StaleRatio: 0.625}, + }, + SegmentsTotal: 3, + SegmentsSelected: 3, + BytesTotal: 192 << 20, + BytesLive: 48 << 20, + BytesStale: 144 << 20, + SelectedBytesTotal: 192 << 20, + SelectedBytesLive: 48 << 20, + SelectedBytesStale: 144 << 20, + }, + rewriteResponse: backenddb.ValueLogRewriteStats{ + BytesBefore: 192 << 20, + BytesAfter: 48 << 20, + RecordsCopied: 3, }, } @@ -2595,11 +3300,13 @@ func TestVlogGenerationRewritePlan_StageConfirmationExecutesConfirmedSubset(t *t db.valueLogRewriteTriggerBytes = 0 db.valueLogRewriteTriggerRatioPPM = 1 db.valueLogGenerationHotTarget = 0 + db.vlogGenerationRewriteBudgetTokensBytes.Store(defaultVlogGenerationWarmTargetBytes * 4) forceVlogMaintenanceIdle(db) if err := db.setVlogGenerationRewriteLedgerWithStage([]backenddb.ValueLogRewritePlanSegment{ - {FileID: 11, BytesTotal: 64 << 20, BytesLive: 20 << 20, BytesStale: 44 << 20, StaleRatio: 0.6875}, - {FileID: 22, BytesTotal: 64 << 20, BytesLive: 8 << 20, BytesStale: 56 << 20, StaleRatio: 0.875}, + {FileID: 11, 
BytesTotal: 64 << 20, BytesLive: 8 << 20, BytesStale: 56 << 20, StaleRatio: 0.875}, + {FileID: 22, BytesTotal: 64 << 20, BytesLive: 16 << 20, BytesStale: 48 << 20, StaleRatio: 0.75}, + {FileID: 33, BytesTotal: 64 << 20, BytesLive: 24 << 20, BytesStale: 40 << 20, StaleRatio: 0.625}, }, true, time.Now().Add(-vlogGenerationRewriteMinInterval-time.Second).UnixNano()); err != nil { t.Fatalf("seed staged rewrite ledger: %v", err) } @@ -2608,27 +3315,20 @@ func TestVlogGenerationRewritePlan_StageConfirmationExecutesConfirmedSubset(t *t db.maybeRunVlogGenerationMaintenanceWithOptions(false, vlogGenerationMaintenanceOptions{ bypassQuiet: true, skipRetainedPruneWait: true, - skipCheckpoint: true, + skipCheckpoint: false, + rewriteDebtDrain: true, debugSource: "rewrite_stage_confirm", }) - deadline := time.Now().Add(2 * schedulerTestWait(t)) - for { - if _, rewriteCalls := recorder.recordedRewrite(); rewriteCalls >= 1 { - break - } - if time.Now().After(deadline) { - _, rewriteCalls := recorder.recordedRewrite() - t.Fatalf("rewrite calls after staged confirmation=%d want=1", rewriteCalls) - } - time.Sleep(10 * time.Millisecond) - } rewriteOpts, rewriteCalls := recorder.recordedRewrite() if rewriteCalls != 1 { t.Fatalf("rewrite calls after staged confirmation=%d want=1", rewriteCalls) } - if got, want := rewriteOpts.SourceFileIDs, []uint32{22}; len(got) != len(want) || got[0] != want[0] { - t.Fatalf("rewrite SourceFileIDs after staged confirmation=%v want=%v", got, want) + if got := len(rewriteOpts.SourceFileIDs); got <= 1 { + t.Fatalf("rewrite SourceFileIDs after staged confirmation=%v want multiple ids", rewriteOpts.SourceFileIDs) + } + if got := len(rewriteOpts.SourceFileIDs); got > vlogGenerationRewriteDebtDrainMaxSegments { + t.Fatalf("rewrite SourceFileIDs len=%d want <= %d", got, vlogGenerationRewriteDebtDrainMaxSegments) } } @@ -3609,8 +4309,8 @@ func TestCheckpoint_KicksVlogGenerationRewriteDespiteRecentForegroundActivity(t if _, calls := recorder.recordedPlan(); 
calls != 1 { t.Fatalf("plan calls=%d want=1", calls) } - if got := db.checkpointRuns.Load(); got < 2 { - t.Fatalf("checkpoint runs=%d want >=2", got) + if got := db.checkpointRuns.Load(); got < 1 { + t.Fatalf("checkpoint runs=%d want >=1", got) } stats := db.Stats() if got := stats["treedb.cache.vlog_generation.checkpoint_kick.runs"]; got != "1" { @@ -3624,6 +4324,52 @@ func TestCheckpoint_KicksVlogGenerationRewriteDespiteRecentForegroundActivity(t } } +func TestCheckpoint_KickHotDebtOnlySkipsFreshPlanDuringRecentForegroundActivity(t *testing.T) { + disableVlogGenerationLoop(t) + t.Setenv(envEnableVlogGenerationCheckpointKickHotDebtOnly, "1") + + dir := t.TempDir() + + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + planResponse: backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11}, + SelectedBytesLive: 128, + }, + rewriteResponse: backenddb.ValueLogRewriteStats{BytesBefore: 64, BytesAfter: 32, RecordsCopied: 1}, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + t.Cleanup(cleanup) + skipRetainedPrune(db) + db.testSkipVlogCheckpointKick = false + + hot := time.Now().UnixNano() + db.lastForegroundWriteUnixNano.Store(hot) + db.lastForegroundReadUnixNano.Store(hot) + + db.maybeKickVlogGenerationMaintenanceAfterCheckpoint() + + time.Sleep(150 * time.Millisecond) + if _, calls := recorder.recordedPlan(); calls != 0 { + t.Fatalf("plan calls=%d want 0", calls) + } + if _, calls := recorder.recordedRewrite(); calls != 0 { + t.Fatalf("rewrite calls=%d want 0", calls) + } + stats := db.Stats() + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.runs"]; got != "0" { + t.Fatalf("checkpoint kick runs=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.skipped_hot_no_debt"]; got != "1" { + t.Fatalf("checkpoint kick skipped_hot_no_debt=%q want 1", got) + } +} + func 
TestCheckpoint_DoesNotKickVlogGenerationRewrite_WALOn(t *testing.T) { disableVlogGenerationLoop(t) @@ -3731,6 +4477,64 @@ func TestCheckpoint_KicksQueuedRewriteDebtBelowTriggerFloor(t *testing.T) { time.Sleep(10 * time.Millisecond) } + stats := db.Stats() + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.runs"]; got != "1" { + t.Fatalf("checkpoint kick runs=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"]; got != "1" { + t.Fatalf("checkpoint kick rewrite runs=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.skipped_hot_no_debt"]; got != "0" { + t.Fatalf("checkpoint kick skipped_hot_no_debt=%q want 0", got) + } +} + +func TestCheckpoint_KickHotDebtOnlyStillRunsQueuedRewriteDebtDuringRecentForegroundActivity(t *testing.T) { + disableVlogGenerationLoop(t) + t.Setenv(envEnableVlogGenerationCheckpointKickHotDebtOnly, "1") + + dir := t.TempDir() + + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + planResponse: backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11}, + SelectedBytesLive: 128, + }, + rewriteResponse: backenddb.ValueLogRewriteStats{BytesBefore: 64, BytesAfter: 32, RecordsCopied: 1}, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + t.Cleanup(cleanup) + skipRetainedPrune(db) + db.testSkipVlogCheckpointKick = false + db.valueLogRewriteTriggerBytes = 1 << 30 + if err := db.setVlogGenerationRewriteQueue([]uint32{11}); err != nil { + t.Fatalf("seed rewrite queue: %v", err) + } + db.vlogGenerationRewriteBudgetTokensBytes.Store(1024) + hot := time.Now().UnixNano() + db.lastForegroundWriteUnixNano.Store(hot) + db.lastForegroundReadUnixNano.Store(hot) + + db.maybeKickVlogGenerationMaintenanceAfterCheckpoint() + + deadline := time.Now().Add(2 * schedulerTestWait(t)) + for { + if _, calls := 
recorder.recordedRewrite(); calls == 1 { + break + } + if time.Now().After(deadline) { + _, rewriteCalls := recorder.recordedRewrite() + t.Fatalf("checkpoint kick with queued debt did not run rewrite in time: rewriteCalls=%d", rewriteCalls) + } + time.Sleep(10 * time.Millisecond) + } + stats := db.Stats() if got := stats["treedb.cache.vlog_generation.checkpoint_kick.runs"]; got != "1" { t.Fatalf("checkpoint kick runs=%q want 1", got) @@ -3789,7 +4593,7 @@ func TestCheckpoint_KickSelfDrainsMaintenanceCollision(t *testing.T) { <-release } -func TestCheckpoint_KicksVlogGenerationGCDespiteRecentForegroundActivity(t *testing.T) { +func TestCheckpoint_KickDoesNotForceGCDuringRecentForegroundActivity(t *testing.T) { disableVlogGenerationLoop(t) t.Setenv(envDisableVlogGenerationRewrite, "1") @@ -3836,102 +4640,261 @@ func TestCheckpoint_KicksVlogGenerationGCDespiteRecentForegroundActivity(t *test t.Fatalf("checkpoint: %v", err) } - deadline := time.Now().Add(2 * schedulerTestWait(t)) - for { - _, realCalls, _ := recorder.recordedCalls() - if realCalls == 1 { - break - } - if time.Now().After(deadline) { - dryCalls, realCalls, _ := recorder.recordedCalls() - t.Fatalf("checkpoint kick did not run gc in time: dryCalls=%d realCalls=%d", dryCalls, realCalls) - } - time.Sleep(10 * time.Millisecond) - } - - if dryCalls, realCalls, _ := recorder.recordedCalls(); dryCalls != 0 || realCalls != 1 { - t.Fatalf("gc calls dry=%d real=%d want dry=0 real=1", dryCalls, realCalls) + time.Sleep(150 * time.Millisecond) + if dryCalls, realCalls, _ := recorder.recordedCalls(); dryCalls != 0 || realCalls != 0 { + t.Fatalf("gc calls dry=%d real=%d want dry=0 real=0", dryCalls, realCalls) } - if got := db.checkpointRuns.Load(); got < 2 { - t.Fatalf("checkpoint runs=%d want >=2", got) + if got := db.checkpointRuns.Load(); got != 1 { + t.Fatalf("checkpoint runs=%d want 1", got) } stats := db.Stats() if got := stats["treedb.cache.vlog_generation.checkpoint_kick.runs"]; got != "1" { 
t.Fatalf("checkpoint kick runs=%q want 1", got) } - if got := stats["treedb.cache.vlog_generation.checkpoint_kick.gc_runs"]; got != "1" { - t.Fatalf("checkpoint kick gc runs=%q want 1", got) + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.gc_runs"]; got != "0" { + t.Fatalf("checkpoint kick gc runs=%q want 0", got) } if got := stats["treedb.cache.vlog_generation.checkpoint_kick.active"]; got != "false" { t.Fatalf("checkpoint kick active=%q want false", got) } -} - -func TestVlogGenerationMaintenance_PeriodicGCSkipsWhileRewriteAgeBlocked(t *testing.T) { - prepareDirectSchedulerTest(t) - t.Setenv(envDisableVlogGenerationRewrite, "1") - - dir := t.TempDir() +} + +func TestVlogGenerationMaintenance_PeriodicGCSkipsWhileRewriteAgeBlocked(t *testing.T) { + prepareDirectSchedulerTest(t) + t.Setenv(envDisableVlogGenerationRewrite, "1") + + dir := t.TempDir() + + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + gcResponse: backenddb.ValueLogGCStats{BytesDeleted: 64}, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + t.Cleanup(cleanup) + skipRetainedPrune(db) + forceVlogMaintenanceIdle(db) + + db.vlogGenerationRewriteAgeBlockedUntilNS.Store(time.Now().Add(30 * time.Second).UnixNano()) + t.Cleanup(func() { db.vlogGenerationRewriteAgeBlockedUntilNS.Store(0) }) + + db.maybeRunVlogGenerationMaintenance(true) + + if _, calls := recorder.recordedGC(); calls != 0 { + t.Fatalf("periodic GC should yield while rewrite age-blocked; gc calls=%d", calls) + } +} + +func TestVlogGenerationMaintenance_PeriodicGCNoopCooldown(t *testing.T) { + prepareDirectSchedulerTest(t) + t.Setenv(envDisableVlogGenerationRewrite, "1") + + dir := t.TempDir() + + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + 
gcResponse: backenddb.ValueLogGCStats{}, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + t.Cleanup(cleanup) + skipRetainedPrune(db) + forceVlogMaintenanceIdle(db) + + quietSince := time.Now().Add(-2 * vlogGenerationMaintenanceQuietWindow).UnixNano() + db.lastForegroundWriteUnixNano.Store(quietSince) + db.lastForegroundReadUnixNano.Store(quietSince) + db.activeForegroundIterators.Store(0) + + db.maybeRunVlogGenerationMaintenance(true) + + if _, calls := recorder.recordedGC(); calls != 1 { + t.Fatalf("first periodic GC calls=%d want=1", calls) + } + if got := db.vlogGenerationLastGCNoopUnixNano.Load(); got <= 0 { + t.Fatalf("last GC noop unix nano=%d want >0 after zero-eligibility pass", got) + } + + // Bypass the normal min-interval gate; noop cooldown should still suppress. + db.vlogGenerationLastGCUnixNano.Store(time.Now().Add(-2 * vlogGenerationGCMinInterval).UnixNano()) + forceVlogMaintenanceIdle(db) + db.maybeRunVlogGenerationMaintenance(true) + + if _, calls := recorder.recordedGC(); calls != 1 { + t.Fatalf("periodic GC should skip under noop cooldown; calls=%d want=1", calls) + } +} + +func TestVlogGenerationMaintenance_PeriodicGCSkipsInWALOnMode(t *testing.T) { + prepareDirectSchedulerTest(t) + + dir := t.TempDir() + + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + gcResponse: backenddb.ValueLogGCStats{BytesDeleted: 64}, + } + + db, err := Open(dir, recorder, Options{ + AllowUnsafe: true, + DisableWAL: false, + JournalLanes: 1, + ValueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold), + ForceValueLogPointers: true, + }) + if err != nil { + t.Fatalf("open cachingdb: %v", err) + } + t.Cleanup(func() { _ = db.Close() }) + skipRetainedPrune(db) + forceVlogMaintenanceIdle(db) + + db.maybeRunVlogGenerationMaintenance(true) + + if _, calls := recorder.recordedGC(); calls != 0 { + t.Fatalf("periodic 
GC should skip in WAL-on mode: gc calls=%d", calls) + } + if got := db.checkpointRuns.Load(); got != 0 { + t.Fatalf("checkpoint runs=%d want 0 for WAL-on periodic GC skip", got) + } +} + +func TestVlogGenerationMaintenance_WALOffPreCheckpointSkipsRewriteByDefault(t *testing.T) { + prepareDirectSchedulerTest(t) + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + planResponse: backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11}, + SelectedSegments: []backenddb.ValueLogRewritePlanSegment{ + {FileID: 11, BytesTotal: 64, BytesLive: 32, BytesStale: 32, StaleRatio: 0.5}, + }, + SegmentsSelected: 1, + SelectedBytesTotal: 64, + SelectedBytesLive: 32, + SelectedBytesStale: 32, + }, + rewriteResponse: backenddb.ValueLogRewriteStats{BytesBefore: 64, BytesAfter: 32, RecordsCopied: 1}, + } + + db, err := Open(dir, recorder, Options{ + AllowUnsafe: true, + DisableWAL: true, + JournalLanes: 1, + ValueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold), + ValueLogRewriteTriggerTotalBytes: 1, + ValueLogRewriteBudgetBytesPerSec: 1024, + ForceValueLogPointers: true, + }) + if err != nil { + t.Fatalf("open cachingdb: %v", err) + } + t.Cleanup(func() { _ = db.Close() }) + skipRetainedPrune(db) - backend, err := backenddb.Open(backenddb.Options{Dir: dir}) - if err != nil { - t.Fatalf("open backend: %v", err) + value := make([]byte, 2048) + b := db.NewBatch() + if err := b.Set([]byte("k"), value); err != nil { + _ = b.Close() + t.Fatalf("set: %v", err) } - recorder := &rewriteBudgetRecordingBackend{ - DB: backend, - gcResponse: backenddb.ValueLogGCStats{BytesDeleted: 64}, + if err := b.Write(); err != nil { + _ = b.Close() + t.Fatalf("write: %v", err) } - - db, cleanup := openRewriteQueueTestDB(t, dir, recorder) - t.Cleanup(cleanup) - skipRetainedPrune(db) + _ = b.Close() forceVlogMaintenanceIdle(db) + 
db.vlogGenerationRewriteBudgetTokensBytes.Store(1024) - db.vlogGenerationRewriteAgeBlockedUntilNS.Store(time.Now().Add(30 * time.Second).UnixNano()) - t.Cleanup(func() { db.vlogGenerationRewriteAgeBlockedUntilNS.Store(0) }) - - db.maybeRunVlogGenerationMaintenance(true) + db.maybeRunVlogGenerationMaintenance(false) - if _, calls := recorder.recordedGC(); calls != 0 { - t.Fatalf("periodic GC should yield while rewrite age-blocked; gc calls=%d", calls) + if _, calls := recorder.recordedRewrite(); calls != 0 { + t.Fatalf("rewrite calls=%d want 0 before first checkpoint", calls) + } + stats := db.Stats() + if got := stats["treedb.cache.vlog_generation.maintenance.skip.before_first_checkpoint"]; got != "1" { + t.Fatalf("pre-checkpoint skip=%q want 1", got) } } -func TestVlogGenerationMaintenance_PeriodicGCSkipsInWALOnMode(t *testing.T) { +func TestVlogGenerationMaintenance_WALOffPreCheckpointCanRunWithEnvOverride(t *testing.T) { prepareDirectSchedulerTest(t) + t.Setenv(envEnableVlogGenerationPreCheckpointRewrite, "1") dir := t.TempDir() - backend, err := backenddb.Open(backenddb.Options{Dir: dir}) if err != nil { t.Fatalf("open backend: %v", err) } recorder := &rewriteBudgetRecordingBackend{ - DB: backend, - gcResponse: backenddb.ValueLogGCStats{BytesDeleted: 64}, + DB: backend, + planResponse: backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11}, + SelectedSegments: []backenddb.ValueLogRewritePlanSegment{ + {FileID: 11, BytesTotal: 64, BytesLive: 32, BytesStale: 32, StaleRatio: 0.5}, + }, + SegmentsSelected: 1, + SelectedBytesTotal: 64, + SelectedBytesLive: 32, + SelectedBytesStale: 32, + }, + rewriteResponse: backenddb.ValueLogRewriteStats{BytesBefore: 64, BytesAfter: 32, RecordsCopied: 1}, } db, err := Open(dir, recorder, Options{ - AllowUnsafe: true, - DisableWAL: false, - JournalLanes: 1, - ValueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold), - ForceValueLogPointers: true, + AllowUnsafe: true, + DisableWAL: true, + JournalLanes: 1, + 
ValueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold), + ValueLogRewriteTriggerTotalBytes: 1, + ValueLogRewriteBudgetBytesPerSec: 1024, + ForceValueLogPointers: true, }) if err != nil { t.Fatalf("open cachingdb: %v", err) } t.Cleanup(func() { _ = db.Close() }) skipRetainedPrune(db) + + value := make([]byte, 2048) + b := db.NewBatch() + if err := b.Set([]byte("k"), value); err != nil { + _ = b.Close() + t.Fatalf("set: %v", err) + } + if err := b.Write(); err != nil { + _ = b.Close() + t.Fatalf("write: %v", err) + } + _ = b.Close() forceVlogMaintenanceIdle(db) + db.vlogGenerationRewriteBudgetTokensBytes.Store(1024) - db.maybeRunVlogGenerationMaintenance(true) + db.maybeRunVlogGenerationMaintenance(false) - if _, calls := recorder.recordedGC(); calls != 0 { - t.Fatalf("periodic GC should skip in WAL-on mode: gc calls=%d", calls) + if _, calls := recorder.recordedRewrite(); calls != 1 { + t.Fatalf("rewrite calls=%d want 1 with pre-checkpoint override", calls) } - if got := db.checkpointRuns.Load(); got != 0 { - t.Fatalf("checkpoint runs=%d want 0 for WAL-on periodic GC skip", got) + stats := db.Stats() + if got := stats["treedb.cache.vlog_generation.maintenance.skip.before_first_checkpoint"]; got != "0" { + t.Fatalf("pre-checkpoint skip=%q want 0 with override", got) } } @@ -3979,6 +4942,40 @@ func TestVlogGenerationMaintenance_PeriodicSkipsWhenMaintenancePhaseNonSteady(t } } +func TestVlogGenerationMaintenance_PeriodicPreflightSkipsHotNoPending(t *testing.T) { + prepareDirectSchedulerTest(t) + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + rewriteResponse: backenddb.ValueLogRewriteStats{BytesBefore: 64, BytesAfter: 32, RecordsCopied: 1}, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + t.Cleanup(cleanup) + skipRetainedPrune(db) + + hot := time.Now().UnixNano() + 
db.lastForegroundWriteUnixNano.Store(hot) + db.lastForegroundReadUnixNano.Store(hot) + db.vlogGenerationCheckpointKickPending.Store(false) + db.vlogGenerationDeferredMaintenancePending.Store(false) + + if ran := db.maybeRunPeriodicVlogGenerationMaintenance(false); ran { + t.Fatal("periodic maintenance unexpectedly entered during hot foreground with no pending wake") + } + if got := db.vlogGenerationMaintenanceAttempts.Load(); got != 0 { + t.Fatalf("maintenance attempts=%d want 0 on preflight skip", got) + } + if _, calls := recorder.recordedRewrite(); calls != 0 { + t.Fatalf("rewrite calls=%d want 0 on preflight skip", calls) + } +} + func TestCheckpoint_KickSkipsWhenMaintenancePhaseNonSteady(t *testing.T) { disableVlogGenerationLoop(t) @@ -4309,6 +5306,12 @@ func TestVlogGenerationRewritePlan_TracksEmptyPlanOutcome(t *testing.T) { if got := stats["treedb.cache.vlog_generation.rewrite.plan_empty"]; got != "1" { t.Fatalf("plan empty=%q want 1", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.plan_empty.age_blocked"]; got != "0" { + t.Fatalf("plan empty age-blocked=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.plan_empty.no_selection"]; got != "1" { + t.Fatalf("plan empty no-selection=%q want 1", got) + } if got := stats["treedb.cache.vlog_generation.rewrite.plan_selected"]; got != "0" { t.Fatalf("plan selected=%q want 0", got) } @@ -4318,6 +5321,15 @@ func TestVlogGenerationRewritePlan_TracksEmptyPlanOutcome(t *testing.T) { if got := stats["treedb.cache.vlog_generation.rewrite.plan_errors"]; got != "0" { t.Fatalf("plan errors=%q want 0", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.runs"]; got != "0" { + t.Fatalf("plan penalty-filter runs=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.segments"]; got != "0" { + t.Fatalf("plan penalty-filter segments=%q want 0", got) + } + if got := 
stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.to_empty_runs"]; got != "0" { + t.Fatalf("plan penalty-filter to-empty=%q want 0", got) + } } func TestVlogGenerationRewritePlan_RunsOutsideMaintenanceBarrier(t *testing.T) { @@ -5431,3 +6443,495 @@ func TestVlogGenerationGC_SkipsDuringRecentForegroundWrites(t *testing.T) { t.Fatalf("gc calls=%d/%d want 0/0 while foreground writes are hot", dryRunCalls, realCalls) } } + +func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { + prepareDirectSchedulerTest(t) + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{DB: backend} + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + defer cleanup() + + db.vlogGenerationMaintenanceAcquired.Store(2) + db.vlogGenerationMaintenancePassTotalNanos.Store(uint64((40 * time.Millisecond).Nanoseconds())) + db.vlogGenerationMaintenancePassMaxNanos.Store(uint64((30 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewritePlanRuns.Store(4) + db.vlogGenerationRewritePlanTotalNanos.Store(uint64((80 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewritePlanMaxNanos.Store(uint64((50 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewriteRuns.Store(3) + db.vlogGenerationRewriteExecTotalNanos.Store(uint64((150 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewriteExecMaxNanos.Store(uint64((70 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewriteBytesIn.Store(1000) + db.vlogGenerationRewriteBytesOut.Store(600) + db.vlogGenerationRewriteReclaimedBytes.Store(400) + db.vlogGenerationGCRuns.Store(2) + db.vlogGenerationGCExecTotalNanos.Store(uint64((60 * time.Millisecond).Nanoseconds())) + db.vlogGenerationGCExecMaxNanos.Store(uint64((35 * time.Millisecond).Nanoseconds())) + db.vlogGenerationVacuumRuns.Store(2) + db.vlogGenerationVacuumExecTotalNanos.Store(uint64((44 * 
time.Millisecond).Nanoseconds())) + db.vlogGenerationVacuumExecMaxNanos.Store(uint64((25 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewriteBudgetTokensBytes.Store(512) + db.vlogGenerationRewriteBudgetConsumed.Store(1536) + db.valueLogRewriteBudgetBytes = 2048 + db.vlogGenerationLastChurnBps.Store(2500) + db.vlogGenerationRewriteAgeBlockedUntilNS.Store(time.Now().Add(5 * time.Second).UnixNano()) + db.vlogGenerationLastGCSegmentsReferenced.Store(7) + db.vlogGenerationLastGCBytesReferenced.Store(700) + db.vlogGenerationLastGCSegmentsActive.Store(4) + db.vlogGenerationLastGCBytesActive.Store(400) + db.vlogGenerationLastGCSegmentsProtected.Store(3) + db.vlogGenerationLastGCBytesProtected.Store(300) + db.vlogGenerationLastGCSegmentsProtectedInUse.Store(1) + db.vlogGenerationLastGCBytesProtectedInUse.Store(100) + db.vlogGenerationLastGCSegmentsProtectedRetained.Store(1) + db.vlogGenerationLastGCBytesProtectedRetained.Store(120) + db.vlogGenerationLastGCSegmentsProtectedOverlap.Store(1) + db.vlogGenerationLastGCBytesProtectedOverlap.Store(80) + db.vlogGenerationLastGCSegmentsProtectedOther.Store(0) + db.vlogGenerationLastGCBytesProtectedOther.Store(0) + db.vlogGenerationLastGCSegmentsEligible.Store(6) + db.vlogGenerationLastGCBytesEligible.Store(600) + db.vlogGenerationLastGCSegmentsDeleted.Store(2) + db.vlogGenerationLastGCBytesDeleted.Store(200) + db.vlogGenerationLastGCSegmentsPending.Store(4) + db.vlogGenerationLastGCBytesPending.Store(400) + db.vlogGenerationLastGCObservedSourceSegments.Store(2) + db.vlogGenerationLastGCObservedSourceSegmentsReferenced.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsActive.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsProtected.Store(2) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedInUse.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedRetained.Store(2) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedOverlap.Store(0) + 
db.vlogGenerationLastGCObservedSourceSegmentsProtectedOther.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsEligible.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsDeleted.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsPending.Store(0) + db.vlogGenerationLastGCObservedSourceBytes.Store(250) + db.vlogGenerationLastGCObservedSourceBytesReferenced.Store(0) + db.vlogGenerationLastGCObservedSourceBytesActive.Store(0) + db.vlogGenerationLastGCObservedSourceBytesProtected.Store(250) + db.vlogGenerationLastGCObservedSourceBytesProtectedInUse.Store(0) + db.vlogGenerationLastGCObservedSourceBytesProtectedRetained.Store(250) + db.vlogGenerationLastGCObservedSourceBytesProtectedOverlap.Store(0) + db.vlogGenerationLastGCObservedSourceBytesProtectedOther.Store(0) + db.vlogGenerationLastGCObservedSourceBytesEligible.Store(0) + db.vlogGenerationLastGCObservedSourceBytesDeleted.Store(0) + db.vlogGenerationLastGCObservedSourceBytesPending.Store(0) + db.vlogGenerationMaintenanceSkipStageNotDue.Store(5) + db.vlogGenerationMaintenanceSkipStageDue.Store(2) + db.vlogGenerationRewritePlanSelectedSegments.Store(6) + db.vlogGenerationRewriteExecSourceSegments.Store(3) + db.vlogGenerationRewriteSourceSegmentsRequestedTotal.Store(5) + db.vlogGenerationRewriteSourceSegmentsStillReferencedTotal.Store(2) + db.vlogGenerationRewriteSourceSegmentsUnreferencedTotal.Store(3) + db.vlogGenerationRewriteSourceSegmentsRequestedLast.Store(2) + db.vlogGenerationRewriteSourceSegmentsStillReferencedLast.Store(1) + db.vlogGenerationRewriteSourceSegmentsUnreferencedLast.Store(1) + db.vlogGenerationRewriteProcessedLiveBytes.Store(900) + db.vlogGenerationRewriteProcessedStaleBytes.Store(450) + db.vlogGenerationRewriteNoReclaimRuns.Store(3) + db.vlogGenerationRewriteNoReclaimStaleBytes.Store(320) + db.vlogGenerationObservedGCQueuedBatches.Store(5) + db.vlogGenerationObservedGCQueuedIDs.Store(12) + db.vlogGenerationObservedGCTakenBatches.Store(4) + 
db.vlogGenerationObservedGCTakenIDs.Store(9) + db.vlogGenerationObservedGCRuns.Store(3) + db.vlogGenerationObservedGCRetryQueued.Store(2) + db.vlogGenerationObservedGCRetryDropped.Store(1) + db.vlogGenerationObservedGCLatencyCompletedIDs.Store(6) + db.vlogGenerationObservedGCLatencyDroppedIDs.Store(2) + db.vlogGenerationObservedGCLatencyTotalMS.Store(640) + db.vlogGenerationObservedGCLatencyMaxMS.Store(210) + db.vlogGenerationObservedGCSourceSegmentsTotal.Store(11) + db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Store(5) + db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Store(3) + db.vlogGenerationObservedGCSourceSegmentsProtectedInUseTotal.Store(1) + db.vlogGenerationObservedGCSourceSegmentsProtectedRetainedTotal.Store(2) + db.vlogGenerationObservedGCSourceSegmentsProtectedOverlapTotal.Store(3) + db.vlogGenerationObservedGCSourceSegmentsProtectedOtherTotal.Store(4) + db.vlogGenerationObservedGCSourceBytesTotal.Store(1100) + db.vlogGenerationObservedGCSourceBytesEligibleTotal.Store(500) + db.vlogGenerationObservedGCSourceBytesDeletedTotal.Store(300) + db.vlogGenerationObservedGCSourceBytesProtectedInUseTotal.Store(50) + db.vlogGenerationObservedGCSourceBytesProtectedRetainedTotal.Store(250) + db.vlogGenerationObservedGCSourceBytesProtectedOverlapTotal.Store(75) + db.vlogGenerationObservedGCSourceBytesProtectedOtherTotal.Store(25) + + db.vlogGenerationRewriteQueueMu.Lock() + db.vlogGenerationRewriteQueueLoaded = true + db.vlogGenerationRewriteQueue = []uint32{11, 12} + db.vlogGenerationRewriteLedger = []backenddb.ValueLogRewritePlanSegment{ + {FileID: 11, BytesTotal: 1000, BytesLive: 700, BytesStale: 300}, + {FileID: 12, BytesTotal: 500, BytesLive: 500, BytesStale: 0}, + } + db.vlogGenerationRewritePenalties = map[uint32]valueLogGenerationRewritePenalty{ + 11: {Attempts: 1, CooldownUntilUnixNano: time.Now().Add(time.Minute).UnixNano()}, + } + db.vlogGenerationRewriteStagePending = true + db.vlogGenerationRewriteStageObservedUnixNano = 1234 + 
db.vlogGenerationRewriteQueueMu.Unlock() + db.vlogGenerationObservedGCMu.Lock() + db.vlogGenerationObservedGCSourceIDs = map[uint32]struct{}{ + 101: {}, + 102: {}, + } + db.vlogGenerationObservedGCMu.Unlock() + + stats := db.Stats() + if got := stats["treedb.cache.vlog_generation.maintenance.pass.total_ms"]; got != "40.000" { + t.Fatalf("maintenance pass total ms=%q want 40.000", got) + } + if got := stats["treedb.cache.vlog_generation.maintenance.pass.max_ms"]; got != "30.000" { + t.Fatalf("maintenance pass max ms=%q want 30.000", got) + } + if got := stats["treedb.cache.vlog_generation.maintenance.pass.avg_ms"]; got != "20.000" { + t.Fatalf("maintenance pass avg ms=%q want 20.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.plan.total_ms"]; got != "80.000" { + t.Fatalf("rewrite plan total ms=%q want 80.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.plan.avg_ms"]; got != "20.000" { + t.Fatalf("rewrite plan avg ms=%q want 20.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.total_ms"]; got != "150.000" { + t.Fatalf("rewrite exec total ms=%q want 150.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.avg_ms"]; got != "50.000" { + t.Fatalf("rewrite exec avg ms=%q want 50.000", got) + } + if got := stats["treedb.cache.vlog_generation.gc.exec.total_ms"]; got != "60.000" { + t.Fatalf("gc exec total ms=%q want 60.000", got) + } + if got := stats["treedb.cache.vlog_generation.gc.exec.avg_ms"]; got != "30.000" { + t.Fatalf("gc exec avg ms=%q want 30.000", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_referenced_segments"]; got != "7" { + t.Fatalf("gc last referenced segments=%q want 7", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_referenced_bytes"]; got != "700" { + t.Fatalf("gc last referenced bytes=%q want 700", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_active_segments"]; got != "4" { + t.Fatalf("gc last active 
segments=%q want 4", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_active_bytes"]; got != "400" { + t.Fatalf("gc last active bytes=%q want 400", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_segments"]; got != "3" { + t.Fatalf("gc last protected segments=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_bytes"]; got != "300" { + t.Fatalf("gc last protected bytes=%q want 300", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_in_use_segments"]; got != "1" { + t.Fatalf("gc last protected in use segments=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_in_use_bytes"]; got != "100" { + t.Fatalf("gc last protected in use bytes=%q want 100", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_retained_segments"]; got != "1" { + t.Fatalf("gc last protected retained segments=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_retained_bytes"]; got != "120" { + t.Fatalf("gc last protected retained bytes=%q want 120", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_overlap_segments"]; got != "1" { + t.Fatalf("gc last protected overlap segments=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_overlap_bytes"]; got != "80" { + t.Fatalf("gc last protected overlap bytes=%q want 80", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_other_segments"]; got != "0" { + t.Fatalf("gc last protected other segments=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_other_bytes"]; got != "0" { + t.Fatalf("gc last protected other bytes=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_eligible_segments"]; got != "6" { + t.Fatalf("gc last eligible segments=%q want 6", got) + } + if got := 
stats["treedb.cache.vlog_generation.gc.last_eligible_bytes"]; got != "600" { + t.Fatalf("gc last eligible bytes=%q want 600", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_deleted_segments"]; got != "2" { + t.Fatalf("gc last deleted segments=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_deleted_bytes"]; got != "200" { + t.Fatalf("gc last deleted bytes=%q want 200", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_pending_segments"]; got != "4" { + t.Fatalf("gc last pending segments=%q want 4", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_pending_bytes"]; got != "400" { + t.Fatalf("gc last pending bytes=%q want 400", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments"]; got != "2" { + t.Fatalf("gc last observed source segments=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_referenced"]; got != "0" { + t.Fatalf("gc last observed source segments referenced=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_active"]; got != "0" { + t.Fatalf("gc last observed source segments active=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected"]; got != "2" { + t.Fatalf("gc last observed source segments protected=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_in_use"]; got != "0" { + t.Fatalf("gc last observed source segments protected in-use=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_retained"]; got != "2" { + t.Fatalf("gc last observed source segments protected retained=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_overlap"]; got != "0" { + t.Fatalf("gc last observed source segments protected 
overlap=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_other"]; got != "0" { + t.Fatalf("gc last observed source segments protected other=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_eligible"]; got != "0" { + t.Fatalf("gc last observed source segments eligible=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_deleted"]; got != "0" { + t.Fatalf("gc last observed source segments deleted=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_pending"]; got != "0" { + t.Fatalf("gc last observed source segments pending=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes"]; got != "250" { + t.Fatalf("gc last observed source bytes=%q want 250", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_referenced"]; got != "0" { + t.Fatalf("gc last observed source bytes referenced=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_active"]; got != "0" { + t.Fatalf("gc last observed source bytes active=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected"]; got != "250" { + t.Fatalf("gc last observed source bytes protected=%q want 250", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_in_use"]; got != "0" { + t.Fatalf("gc last observed source bytes protected in-use=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_retained"]; got != "250" { + t.Fatalf("gc last observed source bytes protected retained=%q want 250", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_overlap"]; got != "0" { + t.Fatalf("gc last observed source bytes 
protected overlap=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_other"]; got != "0" { + t.Fatalf("gc last observed source bytes protected other=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_eligible"]; got != "0" { + t.Fatalf("gc last observed source bytes eligible=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_deleted"]; got != "0" { + t.Fatalf("gc last observed source bytes deleted=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_pending"]; got != "0" { + t.Fatalf("gc last observed source bytes pending=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.vacuum.exec.total_ms"]; got != "44.000" { + t.Fatalf("vacuum exec total ms=%q want 44.000", got) + } + if got := stats["treedb.cache.vlog_generation.vacuum.exec.avg_ms"]; got != "22.000" { + t.Fatalf("vacuum exec avg ms=%q want 22.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.ledger_segments"]; got != "2" { + t.Fatalf("rewrite ledger segments=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_total"]; got != "1500" { + t.Fatalf("rewrite ledger bytes total=%q want 1500", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_live"]; got != "1200" { + t.Fatalf("rewrite ledger bytes live=%q want 1200", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_stale"]; got != "300" { + t.Fatalf("rewrite ledger bytes stale=%q want 300", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.ledger_stale_ratio_ppm"]; got != "200000" { + t.Fatalf("rewrite ledger stale ratio ppm=%q want 200000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.stage_pending"]; got != "true" { + t.Fatalf("rewrite stage pending=%q want true", got) + } + if got := 
stats["treedb.cache.vlog_generation.rewrite.stage_observed_unix_nano"]; got != "1234" { + t.Fatalf("rewrite stage observed=%q want 1234", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.penalties_active"]; got != "1" { + t.Fatalf("rewrite penalties active=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.age_blocked_remaining_ms"]; got == "0" { + t.Fatalf("rewrite age blocked remaining ms=%q want >0", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.tokens_bytes"]; got != "512" { + t.Fatalf("rewrite budget tokens bytes=%q want 512", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total"]; got != "1536" { + t.Fatalf("rewrite budget consumed=%q want 1536", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_per_sec"]; got != "10240.000" { + t.Fatalf("rewrite budget consumed bytes/sec=%q want 10240.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.consumed_share_of_budget_pct"]; got != "500.000" { + t.Fatalf("rewrite budget consumed share pct=%q want 500.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.tokens_cap_bytes"]; got == "0" { + t.Fatalf("rewrite budget cap bytes=%q want non-zero", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct"]; got == "" { + t.Fatalf("rewrite budget utilization pct missing") + } + if got := stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate_not_due"]; got != "5" { + t.Fatalf("maintenance skip stage gate not due=%q want 5", got) + } + if got := stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate_due_reserved"]; got != "2" { + t.Fatalf("maintenance skip stage gate due reserved=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.plan_selected_segments_total"]; got != "6" { + t.Fatalf("rewrite plan selected segments total=%q want 6", got) + } + if got := 
stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"]; got != "3" { + t.Fatalf("rewrite exec source segments total=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"]; got != "5" { + t.Fatalf("rewrite exec source segments requested total=%q want 5", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"]; got != "2" { + t.Fatalf("rewrite exec source segments still referenced total=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_total"]; got != "3" { + t.Fatalf("rewrite exec source segments unreferenced total=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_last"]; got != "2" { + t.Fatalf("rewrite exec source segments requested last=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_last"]; got != "1" { + t.Fatalf("rewrite exec source segments still referenced last=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_last"]; got != "1" { + t.Fatalf("rewrite exec source segments unreferenced last=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.processed_live_bytes"]; got != "900" { + t.Fatalf("rewrite processed live bytes=%q want 900", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.processed_stale_bytes"]; got != "450" { + t.Fatalf("rewrite processed stale bytes=%q want 450", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.reclaim_ratio"]; got != "0.400000" { + t.Fatalf("rewrite reclaim ratio=%q want 0.400000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.output_ratio"]; got != "0.600000" { + t.Fatalf("rewrite output ratio=%q want 0.600000", got) + } + if got := 
stats["treedb.cache.vlog_generation.rewrite.processed_stale_ratio"]; got != "0.333333" { + t.Fatalf("rewrite processed stale ratio=%q want 0.333333", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.bytes_in_per_sec"]; got != "6666.667" { + t.Fatalf("rewrite exec bytes in/sec=%q want 6666.667", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.bytes_out_per_sec"]; got != "4000.000" { + t.Fatalf("rewrite exec bytes out/sec=%q want 4000.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.reclaimed_bytes_per_sec"]; got != "2666.667" { + t.Fatalf("rewrite exec reclaimed bytes/sec=%q want 2666.667", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.reclaimed_vs_churn_ratio"]; got != "1.066667" { + t.Fatalf("rewrite reclaimed vs churn ratio=%q want 1.066667", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.no_reclaim_runs"]; got != "3" { + t.Fatalf("rewrite no reclaim runs=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.no_reclaim_stale_bytes"]; got != "320" { + t.Fatalf("rewrite no reclaim stale bytes=%q want 320", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.pending_ids"]; got != "2" { + t.Fatalf("observed gc pending ids=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.queued_batches"]; got != "5" { + t.Fatalf("observed gc queued batches=%q want 5", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.queued_ids"]; got != "12" { + t.Fatalf("observed gc queued ids=%q want 12", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.taken_batches"]; got != "4" { + t.Fatalf("observed gc taken batches=%q want 4", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.taken_ids"]; got != "9" { + t.Fatalf("observed gc taken ids=%q want 9", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.runs"]; got != "3" { + t.Fatalf("observed 
gc runs=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.retry_queued"]; got != "2" { + t.Fatalf("observed gc retry queued=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.retry_dropped"]; got != "1" { + t.Fatalf("observed gc retry dropped=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.retry_max_attempts"]; got != "3" { + t.Fatalf("observed gc retry max attempts=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.latency.completed_ids"]; got != "6" { + t.Fatalf("observed gc latency completed ids=%q want 6", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.latency.dropped_ids"]; got != "2" { + t.Fatalf("observed gc latency dropped ids=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.latency.total_ms"]; got != "640" { + t.Fatalf("observed gc latency total ms=%q want 640", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.latency.max_ms"]; got != "210" { + t.Fatalf("observed gc latency max ms=%q want 210", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.latency.avg_ms"]; got != "80.000" { + t.Fatalf("observed gc latency avg ms=%q want 80.000", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_total"]; got != "11" { + t.Fatalf("observed gc source segments total=%q want 11", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"]; got != "5" { + t.Fatalf("observed gc source segments eligible total=%q want 5", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"]; got != "3" { + t.Fatalf("observed gc source segments deleted total=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_in_use_total"]; got != "1" { + t.Fatalf("observed gc source segments protected in-use total=%q 
want 1", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_retained_total"]; got != "2" { + t.Fatalf("observed gc source segments protected retained total=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_overlap_total"]; got != "3" { + t.Fatalf("observed gc source segments protected overlap total=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_other_total"]; got != "4" { + t.Fatalf("observed gc source segments protected other total=%q want 4", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_total"]; got != "1100" { + t.Fatalf("observed gc source bytes total=%q want 1100", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_eligible_total"]; got != "500" { + t.Fatalf("observed gc source bytes eligible total=%q want 500", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total"]; got != "300" { + t.Fatalf("observed gc source bytes deleted total=%q want 300", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_in_use_total"]; got != "50" { + t.Fatalf("observed gc source bytes protected in-use total=%q want 50", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total"]; got != "250" { + t.Fatalf("observed gc source bytes protected retained total=%q want 250", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_overlap_total"]; got != "75" { + t.Fatalf("observed gc source bytes protected overlap total=%q want 75", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_other_total"]; got != "25" { + t.Fatalf("observed gc source bytes protected other total=%q want 25", got) + } +} diff --git a/TreeDB/db/db.go b/TreeDB/db/db.go index 
ae4b583d3..d7b2f7c05 100644 --- a/TreeDB/db/db.go +++ b/TreeDB/db/db.go @@ -280,6 +280,11 @@ type ValueLogGenerationConfig struct { // RewriteTriggerChurnPerSec triggers rewrite when churn rate exceeds // threshold (0 disables). RewriteTriggerChurnPerSec int64 + // RewriteMinSegmentAge gates online rewrite to source segments that are at + // least this old. + // + // 0 uses the implementation default. + RewriteMinSegmentAge time.Duration } // ValueLogDomainThreshold overrides inline-vs-pointer placement policy for keys @@ -968,6 +973,9 @@ func validateOptions(opts Options) error { if opts.ValueLog.Generational.RewriteTriggerChurnPerSec < 0 { return fmt.Errorf("treedb: invalid value-log generational rewrite trigger churn/sec %d", opts.ValueLog.Generational.RewriteTriggerChurnPerSec) } + if opts.ValueLog.Generational.RewriteMinSegmentAge < 0 { + return fmt.Errorf("treedb: invalid value-log generational rewrite min segment age %s", opts.ValueLog.Generational.RewriteMinSegmentAge) + } seenDomains := make(map[string]struct{}, len(opts.ValueLog.DomainInlineThresholds)) for i := range opts.ValueLog.DomainInlineThresholds { d := opts.ValueLog.DomainInlineThresholds[i] diff --git a/TreeDB/db/vlog_gc.go b/TreeDB/db/vlog_gc.go index d67d0aaf1..258c03b5a 100644 --- a/TreeDB/db/vlog_gc.go +++ b/TreeDB/db/vlog_gc.go @@ -17,24 +17,68 @@ const valueLogKeepRecentSegmentsPerLane = 2 // ValueLogGCOptions controls value-log garbage collection. type ValueLogGCOptions struct { - DryRun bool + DryRun bool + // ProtectedPaths preserves legacy callers that provide a single merged set + // of protected paths. Prefer the specific ProtectedInUsePaths and + // ProtectedRetainedPaths fields for blocker classification. ProtectedPaths []string + // ProtectedInUsePaths are paths that may still be referenced by mutable + // in-memory state during online maintenance. + ProtectedInUsePaths []string + // ProtectedRetainedPaths are paths pinned by pointer lifecycle retention. 
+ ProtectedRetainedPaths []string + // ObservedSourceFileIDs enables per-classification probe counters for a + // caller-provided subset of segment IDs (for example, rewrite-selected + // source segments). IDs not present in the current set are ignored. + ObservedSourceFileIDs []uint32 } // ValueLogGCStats summarizes value-log GC work. type ValueLogGCStats struct { - SegmentsTotal int - SegmentsReferenced int - SegmentsActive int - SegmentsProtected int - SegmentsEligible int - SegmentsDeleted int - BytesTotal int64 - BytesReferenced int64 - BytesActive int64 - BytesProtected int64 - BytesEligible int64 - BytesDeleted int64 + SegmentsTotal int + SegmentsReferenced int + SegmentsActive int + SegmentsProtected int + SegmentsProtectedInUse int + SegmentsProtectedRetained int + SegmentsProtectedOverlap int + SegmentsProtectedOther int + SegmentsEligible int + SegmentsDeleted int + SegmentsPending int + BytesTotal int64 + BytesReferenced int64 + BytesActive int64 + BytesProtected int64 + BytesProtectedInUse int64 + BytesProtectedRetained int64 + BytesProtectedOverlap int64 + BytesProtectedOther int64 + BytesEligible int64 + BytesDeleted int64 + BytesPending int64 + ObservedSourceSegments int + ObservedSourceSegmentsReferenced int + ObservedSourceSegmentsActive int + ObservedSourceSegmentsProtected int + ObservedSourceSegmentsProtectedInUse int + ObservedSourceSegmentsProtectedRetained int + ObservedSourceSegmentsProtectedOverlap int + ObservedSourceSegmentsProtectedOther int + ObservedSourceSegmentsEligible int + ObservedSourceSegmentsDeleted int + ObservedSourceSegmentsPending int + ObservedSourceBytes int64 + ObservedSourceBytesReferenced int64 + ObservedSourceBytesActive int64 + ObservedSourceBytesProtected int64 + ObservedSourceBytesProtectedInUse int64 + ObservedSourceBytesProtectedRetained int64 + ObservedSourceBytesProtectedOverlap int64 + ObservedSourceBytesProtectedOther int64 + ObservedSourceBytesEligible int64 + ObservedSourceBytesDeleted int64 + 
ObservedSourceBytesPending int64 } // ValueLogGC deletes fully-unreferenced value-log segments. @@ -81,8 +125,9 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG set = vm.CurrentSetNoRefresh() } keptIDs := currentValueLogIDs(set) - if len(opts.ProtectedPaths) > 0 { - if recent := recentValueLogIDsForProtectedPaths(set, valueLogKeepRecentSegmentsPerLane, opts.ProtectedPaths); len(recent) > 0 { + protectedAll := mergeUniqueNonEmptyPaths(opts.ProtectedPaths, opts.ProtectedInUsePaths, opts.ProtectedRetainedPaths) + if len(protectedAll) > 0 { + if recent := recentValueLogIDsForProtectedPaths(set, valueLogKeepRecentSegmentsPerLane, protectedAll); len(recent) > 0 { keptIDs = recent } } @@ -93,38 +138,120 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG } protectedPaths[path] = struct{}{} } + protectedInUsePaths := make(map[string]struct{}, len(opts.ProtectedInUsePaths)) + for _, path := range opts.ProtectedInUsePaths { + if path == "" { + continue + } + protectedInUsePaths[path] = struct{}{} + } + protectedRetainedPaths := make(map[string]struct{}, len(opts.ProtectedRetainedPaths)) + for _, path := range opts.ProtectedRetainedPaths { + if path == "" { + continue + } + protectedRetainedPaths[path] = struct{}{} + } type candidate struct { - path string - size int64 + path string + size int64 + observed bool } candidates := make(map[uint32]candidate) + observedSourceIDs := make(map[uint32]struct{}, len(opts.ObservedSourceFileIDs)) + for _, id := range opts.ObservedSourceFileIDs { + if id == 0 { + continue + } + observedSourceIDs[id] = struct{}{} + } for id, f := range set.Files { if err := ctx.Err(); err != nil { return stats, err } size := fileSize(f) + observed := false + if _, ok := observedSourceIDs[id]; ok { + observed = true + stats.ObservedSourceSegments++ + stats.ObservedSourceBytes += size + } stats.SegmentsTotal++ stats.BytesTotal += size if _, ok := referenced[id]; ok { stats.SegmentsReferenced++ 
stats.BytesReferenced += size + if observed { + stats.ObservedSourceSegmentsReferenced++ + stats.ObservedSourceBytesReferenced += size + } continue } if _, ok := keptIDs[id]; ok { stats.SegmentsActive++ stats.BytesActive += size + if observed { + stats.ObservedSourceSegmentsActive++ + stats.ObservedSourceBytesActive += size + } + continue + } + _, inUseProtected := protectedInUsePaths[f.Path] + _, retainedProtected := protectedRetainedPaths[f.Path] + if inUseProtected || retainedProtected { + stats.SegmentsProtected++ + stats.BytesProtected += size + if observed { + stats.ObservedSourceSegmentsProtected++ + stats.ObservedSourceBytesProtected += size + } + switch { + case inUseProtected && retainedProtected: + stats.SegmentsProtectedOverlap++ + stats.BytesProtectedOverlap += size + if observed { + stats.ObservedSourceSegmentsProtectedOverlap++ + stats.ObservedSourceBytesProtectedOverlap += size + } + case inUseProtected: + stats.SegmentsProtectedInUse++ + stats.BytesProtectedInUse += size + if observed { + stats.ObservedSourceSegmentsProtectedInUse++ + stats.ObservedSourceBytesProtectedInUse += size + } + default: + stats.SegmentsProtectedRetained++ + stats.BytesProtectedRetained += size + if observed { + stats.ObservedSourceSegmentsProtectedRetained++ + stats.ObservedSourceBytesProtectedRetained += size + } + } continue } if _, ok := protectedPaths[f.Path]; ok { stats.SegmentsProtected++ stats.BytesProtected += size + stats.SegmentsProtectedOther++ + stats.BytesProtectedOther += size + if observed { + stats.ObservedSourceSegmentsProtected++ + stats.ObservedSourceBytesProtected += size + stats.ObservedSourceSegmentsProtectedOther++ + stats.ObservedSourceBytesProtectedOther += size + } continue } stats.SegmentsEligible++ stats.BytesEligible += size + if observed { + stats.ObservedSourceSegmentsEligible++ + stats.ObservedSourceBytesEligible += size + } if opts.DryRun { continue @@ -132,7 +259,7 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) 
(ValueLogG if err := vm.MarkZombie(id); err != nil { return stats, err } - candidates[id] = candidate{path: f.Path, size: size} + candidates[id] = candidate{path: f.Path, size: size, observed: observed} } if opts.DryRun { @@ -159,11 +286,27 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG if os.IsNotExist(err) { stats.SegmentsDeleted++ stats.BytesDeleted += info.size + if info.observed { + stats.ObservedSourceSegmentsDeleted++ + stats.ObservedSourceBytesDeleted += info.size + } } else { return stats, err } } } + if stats.SegmentsEligible > stats.SegmentsDeleted { + stats.SegmentsPending = stats.SegmentsEligible - stats.SegmentsDeleted + } + if stats.BytesEligible > stats.BytesDeleted { + stats.BytesPending = stats.BytesEligible - stats.BytesDeleted + } + if stats.ObservedSourceSegmentsEligible > stats.ObservedSourceSegmentsDeleted { + stats.ObservedSourceSegmentsPending = stats.ObservedSourceSegmentsEligible - stats.ObservedSourceSegmentsDeleted + } + if stats.ObservedSourceBytesEligible > stats.ObservedSourceBytesDeleted { + stats.ObservedSourceBytesPending = stats.ObservedSourceBytesEligible - stats.ObservedSourceBytesDeleted + } currentSet := vm.CurrentSetNoRefresh() if currentSet != nil { @@ -200,6 +343,24 @@ func currentValueLogIDs(set *valuelog.Set) map[uint32]struct{} { return active } +func mergeUniqueNonEmptyPaths(pathSets ...[]string) []string { + seen := make(map[string]struct{}) + var out []string + for _, paths := range pathSets { + for _, path := range paths { + if path == "" { + continue + } + if _, ok := seen[path]; ok { + continue + } + seen[path] = struct{}{} + out = append(out, path) + } + } + return out +} + func recentValueLogIDs(set *valuelog.Set, keepPerLane int) map[uint32]struct{} { if keepPerLane <= 1 { return currentValueLogIDs(set) diff --git a/TreeDB/db/vlog_gc_test.go b/TreeDB/db/vlog_gc_test.go index fbdee4dc0..c6da6710c 100644 --- a/TreeDB/db/vlog_gc_test.go +++ b/TreeDB/db/vlog_gc_test.go @@ -176,6 
+176,146 @@ func TestValueLogGC_ProtectedPathsDoNotKeepHistoricalRewriteLanes(t *testing.T) } } +func TestValueLogGC_ProtectedPathBreakdownStats(t *testing.T) { + dir := t.TempDir() + + db, err := Open(Options{Dir: dir}) + if err != nil { + t.Fatalf("open: %v", err) + } + defer func() { _ = db.Close() }() + + for seq := 1; seq <= 5; seq++ { + seq := seq + appendPointersInNewSegment(t, dir, 0, uint32(seq), uint64(seq)*1_000, 1, func(int) []byte { + return bytes.Repeat([]byte(fmt.Sprintf("lane0-seq%d|", seq)), 32) + }) + } + + if err := db.RefreshValueLogSet(); err != nil { + t.Fatalf("RefreshValueLogSet: %v", err) + } + + inUseOnlyPath := filepath.Join(dir, "wal", "value-l0-000001.log") + retainedOnlyPath := filepath.Join(dir, "wal", "value-l0-000002.log") + overlapPath := filepath.Join(dir, "wal", "value-l0-000003.log") + observedInUseID, err := valuelog.EncodeFileID(0, 1) + if err != nil { + t.Fatalf("observed in-use fileid: %v", err) + } + observedRetainedID, err := valuelog.EncodeFileID(0, 2) + if err != nil { + t.Fatalf("observed retained fileid: %v", err) + } + observedOverlapID, err := valuelog.EncodeFileID(0, 3) + if err != nil { + t.Fatalf("observed overlap fileid: %v", err) + } + + stats, err := db.ValueLogGC(context.Background(), ValueLogGCOptions{ + DryRun: true, + ProtectedInUsePaths: []string{inUseOnlyPath, overlapPath}, + ProtectedRetainedPaths: []string{retainedOnlyPath, overlapPath}, + ObservedSourceFileIDs: []uint32{observedInUseID, observedRetainedID, observedOverlapID}, + }) + if err != nil { + t.Fatalf("ValueLogGC: %v", err) + } + + if stats.SegmentsTotal != 5 { + t.Fatalf("segments total=%d want 5", stats.SegmentsTotal) + } + if stats.SegmentsActive != 2 { + t.Fatalf("segments active=%d want 2", stats.SegmentsActive) + } + if stats.SegmentsProtected != 3 { + t.Fatalf("segments protected=%d want 3", stats.SegmentsProtected) + } + if stats.SegmentsProtectedInUse != 1 { + t.Fatalf("segments protected in-use=%d want 1", 
stats.SegmentsProtectedInUse) + } + if stats.SegmentsProtectedRetained != 1 { + t.Fatalf("segments protected retained=%d want 1", stats.SegmentsProtectedRetained) + } + if stats.SegmentsProtectedOverlap != 1 { + t.Fatalf("segments protected overlap=%d want 1", stats.SegmentsProtectedOverlap) + } + if stats.SegmentsProtectedOther != 0 { + t.Fatalf("segments protected other=%d want 0", stats.SegmentsProtectedOther) + } + if stats.SegmentsEligible != 0 { + t.Fatalf("segments eligible=%d want 0", stats.SegmentsEligible) + } + if stats.SegmentsDeleted != 0 { + t.Fatalf("segments deleted=%d want 0", stats.SegmentsDeleted) + } + if stats.BytesProtected <= 0 { + t.Fatalf("bytes protected=%d want >0", stats.BytesProtected) + } + if stats.BytesProtectedInUse <= 0 || stats.BytesProtectedRetained <= 0 || stats.BytesProtectedOverlap <= 0 { + t.Fatalf("expected non-zero protected byte buckets, got %+v", stats) + } + if stats.BytesProtectedOther != 0 { + t.Fatalf("bytes protected other=%d want 0", stats.BytesProtectedOther) + } + if stats.ObservedSourceSegments != 3 { + t.Fatalf("observed source segments=%d want 3", stats.ObservedSourceSegments) + } + if stats.ObservedSourceSegmentsReferenced != 0 { + t.Fatalf("observed source segments referenced=%d want 0", stats.ObservedSourceSegmentsReferenced) + } + if stats.ObservedSourceSegmentsActive != 0 { + t.Fatalf("observed source segments active=%d want 0", stats.ObservedSourceSegmentsActive) + } + if stats.ObservedSourceSegmentsProtected != 3 { + t.Fatalf("observed source segments protected=%d want 3", stats.ObservedSourceSegmentsProtected) + } + if stats.ObservedSourceSegmentsProtectedInUse != 1 { + t.Fatalf("observed source segments protected in-use=%d want 1", stats.ObservedSourceSegmentsProtectedInUse) + } + if stats.ObservedSourceSegmentsProtectedRetained != 1 { + t.Fatalf("observed source segments protected retained=%d want 1", stats.ObservedSourceSegmentsProtectedRetained) + } + if stats.ObservedSourceSegmentsProtectedOverlap 
!= 1 { + t.Fatalf("observed source segments protected overlap=%d want 1", stats.ObservedSourceSegmentsProtectedOverlap) + } + if stats.ObservedSourceSegmentsProtectedOther != 0 { + t.Fatalf("observed source segments protected other=%d want 0", stats.ObservedSourceSegmentsProtectedOther) + } + if stats.ObservedSourceSegmentsEligible != 0 { + t.Fatalf("observed source segments eligible=%d want 0", stats.ObservedSourceSegmentsEligible) + } + if stats.ObservedSourceSegmentsDeleted != 0 { + t.Fatalf("observed source segments deleted=%d want 0", stats.ObservedSourceSegmentsDeleted) + } + if stats.ObservedSourceSegmentsPending != 0 { + t.Fatalf("observed source segments pending=%d want 0", stats.ObservedSourceSegmentsPending) + } + if stats.ObservedSourceBytes <= 0 { + t.Fatalf("observed source bytes=%d want >0", stats.ObservedSourceBytes) + } + if stats.ObservedSourceBytesProtected <= 0 { + t.Fatalf("observed source bytes protected=%d want >0", stats.ObservedSourceBytesProtected) + } + if stats.ObservedSourceBytesProtectedInUse <= 0 || + stats.ObservedSourceBytesProtectedRetained <= 0 || + stats.ObservedSourceBytesProtectedOverlap <= 0 { + t.Fatalf("expected non-zero observed source protected byte buckets, got %+v", stats) + } + if stats.ObservedSourceBytesProtectedOther != 0 { + t.Fatalf("observed source bytes protected other=%d want 0", stats.ObservedSourceBytesProtectedOther) + } + if stats.ObservedSourceBytesEligible != 0 { + t.Fatalf("observed source bytes eligible=%d want 0", stats.ObservedSourceBytesEligible) + } + if stats.ObservedSourceBytesDeleted != 0 { + t.Fatalf("observed source bytes deleted=%d want 0", stats.ObservedSourceBytesDeleted) + } + if stats.ObservedSourceBytesPending != 0 { + t.Fatalf("observed source bytes pending=%d want 0", stats.ObservedSourceBytesPending) + } +} + func TestValueLogGC_KeepsReferencedPointerSegments_WithOuterLeavesInValueLog(t *testing.T) { dir := t.TempDir() diff --git a/TreeDB/db/vlog_rewrite.go b/TreeDB/db/vlog_rewrite.go 
index efb73b610..133d1de4b 100644 --- a/TreeDB/db/vlog_rewrite.go +++ b/TreeDB/db/vlog_rewrite.go @@ -31,6 +31,7 @@ const defaultValueLogRewriteSegmentBytes = 128 << 20 const rewriteDictMinPayloadBytes = 32 << 10 const rewriteDictBatchMaxK = 64 +const rewriteReadScratchMaxCap = 1 << 20 // 1MiB cap to avoid retaining oversized decode buffers func rewriteAllowDictForSmallPayload(value []byte) bool { if len(value) < page.PageSize { @@ -49,6 +50,23 @@ type ValueLogRewriteStats struct { BytesBefore int64 BytesAfter int64 RecordsCopied int + // Value* counters track key/value-pointer payload copied by the main rewrite + // pointer swap path. + ValueRecordsCopied int + ValueBytesCopied int64 + // LeafRef* counters track outer-leaf page payload copied by the leaf-ref + // rewrite path (indexOuterLeavesInValueLog mode). + LeafRefRecordsCopied int + LeafRefBytesCopied int64 + // SourceSegmentsRequested is the number of source segments selected for this + // rewrite run after applying selection filters. + SourceSegmentsRequested int + // SourceSegmentsStillReferenced is the subset of selected source segments + // that remained referenced after rewrite pointer swaps and cleanup. + SourceSegmentsStillReferenced int + // SourceSegmentsUnreferenced is the subset of selected source segments that + // became unreferenced after rewrite pointer swaps and cleanup. 
+ SourceSegmentsUnreferenced int } // ValueLogRewritePlan summarizes which segments a sparse online rewrite would @@ -1180,6 +1198,7 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl } sourceIDs, _ = selectRewriteSourceSegmentsWithStats(opts, set.Files, active, liveByID) restrictSource = true + stats.SourceSegmentsRequested = len(sourceIDs) } _ = db.valueLogManager.Release(set) if restrictSource && len(sourceIDs) == 0 { @@ -1220,6 +1239,7 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl swaps := make([]rewriteSwap, 0, batchSize) localityPolicy := normalizeValueLogRewriteLocalityPolicy(opts.LocalityPolicy) candidates := make([]rewriteCandidate, 0, batchSize) + var rewriteReadScratch []byte var canceledErr error flushBatch := func() error { @@ -1233,7 +1253,7 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl return err } for _, candidate := range candidates { - val, err := db.valueLogManager.Read(candidate.oldPtr) + val, usedScratch, err := db.valueLogManager.ReadUnsafeTo(candidate.oldPtr, rewriteReadScratch) if err != nil { return err } @@ -1241,8 +1261,19 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl if err != nil { return err } + if usedScratch { + // Reuse decode storage across records to reduce alloc churn while + // bounding retained capacity to avoid RSS blow-ups on outliers. + if cap(val) > rewriteReadScratchMaxCap { + rewriteReadScratch = nil + } else { + rewriteReadScratch = val[:0] + } + } startRID++ stats.RecordsCopied++ + stats.ValueRecordsCopied++ + stats.ValueBytesCopied += int64(len(val)) swaps = append(swaps, rewriteSwap{ key: candidate.key, oldPtr: candidate.oldPtr, @@ -1313,11 +1344,13 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl // referenced leaf pages out of the selected source segments so cleanup can // actually reclaim space. 
if restrictSource && db.indexOuterLeavesInValueLog && len(sourceIDs) > 0 { - copied, err := db.rewriteLeafRefsOnline(ctx, writer, ridAlloc, sourceIDs, opts.SyncEachBatch) + copied, copiedBytes, err := db.rewriteLeafRefsOnline(ctx, writer, ridAlloc, sourceIDs, opts.SyncEachBatch) if err != nil { return stats, err } stats.RecordsCopied += copied + stats.LeafRefRecordsCopied += copied + stats.LeafRefBytesCopied += copiedBytes } } else { // Stop publishing further swaps after cancellation; cleanup below still @@ -1351,6 +1384,16 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl if err != nil { return stats, err } + if len(sourceIDs) > 0 { + stillReferenced := 0 + for id := range sourceIDs { + if _, ok := referencedAfter[id]; ok { + stillReferenced++ + } + } + stats.SourceSegmentsStillReferenced = stillReferenced + stats.SourceSegmentsUnreferenced = len(sourceIDs) - stillReferenced + } var protectedPaths map[string]struct{} allowActiveSkip := len(opts.ProtectedPaths) > 0 if allowActiveSkip { @@ -1453,8 +1496,9 @@ type leafRefRewriteCtx struct { leafMap map[uint64]uint64 // old leafref id -> new leafref id internalMap map[uint64]uint64 // old internal page id -> new page id - retired []uint64 - copied int + retired []uint64 + copied int + copiedBytes int64 } func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { @@ -1511,6 +1555,7 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { } c.leafMap[id] = leafID c.copied++ + c.copiedBytes += int64(len(leafPage)) return leafID, true, nil } @@ -1608,32 +1653,32 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { } } -func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, ridAlloc *rewriteRIDAllocator, sourceIDs map[uint32]struct{}, sync bool) (copied int, err error) { +func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, ridAlloc *rewriteRIDAllocator, sourceIDs 
map[uint32]struct{}, sync bool) (copied int, copiedBytes int64, err error) { if db == nil { - return 0, fmt.Errorf("missing db") + return 0, 0, fmt.Errorf("missing db") } if !db.indexOuterLeavesInValueLog { - return 0, nil + return 0, 0, nil } if db.readOnly { - return 0, ErrReadOnly + return 0, 0, ErrReadOnly } if db.valueLogManager == nil { - return 0, fmt.Errorf("value log manager unavailable") + return 0, 0, fmt.Errorf("value log manager unavailable") } if writer == nil || ridAlloc == nil { - return 0, fmt.Errorf("vlog-rewrite: missing writer/rid state") + return 0, 0, fmt.Errorf("vlog-rewrite: missing writer/rid state") } // Treat nil sourceIDs as "all sources" and an empty, non-nil map as "no // sources". The latter means there is nothing to rewrite. if sourceIDs != nil && len(sourceIDs) == 0 { - return 0, nil + return 0, 0, nil } if ctx == nil { ctx = context.Background() } if err := ctx.Err(); err != nil { - return 0, err + return 0, 0, err } db.writeMu.Lock() @@ -1642,7 +1687,7 @@ func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, snap := db.AcquireSnapshot() if snap == nil || snap.idx == nil || snap.state == nil { closeRewriteSnapshot(&err, snap) - return 0, fmt.Errorf("missing snapshot state") + return 0, 0, fmt.Errorf("missing snapshot state") } defer closeRewriteSnapshot(&err, snap) @@ -1679,33 +1724,33 @@ func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, newSysRoot, sysChanged, err := leafCtx.rewriteNode(sysRoot) if err != nil { - return 0, err + return 0, 0, err } newRoot, userChanged, err := leafCtx.rewriteNode(rootID) if err != nil { - return 0, err + return 0, 0, err } if !sysChanged && !userChanged { - return 0, nil + return 0, 0, nil } // Ensure the copied leaf-page records are visible before publishing new leaf // refs that point at them. 
if sync { if err := writer.Sync(); err != nil { - return 0, err + return 0, 0, err } } else { if err := writer.Flush(); err != nil { - return 0, err + return 0, 0, err } } if err := db.finalizeCommit(newRoot, newSysRoot, leafCtx.retired, sync, adaptive.Metrics{}, nil, db.indexOuterLeavesInValueLog, nil); err != nil { - return 0, err + return 0, 0, err } tracker = nil - return leafCtx.copied, nil + return leafCtx.copied, leafCtx.copiedBytes, nil } func nextRewriteRIDStart(segments []logSegment) (uint64, error) { diff --git a/TreeDB/db/vlog_rewrite_test.go b/TreeDB/db/vlog_rewrite_test.go index 8528853ea..91127e148 100644 --- a/TreeDB/db/vlog_rewrite_test.go +++ b/TreeDB/db/vlog_rewrite_test.go @@ -2867,6 +2867,15 @@ func TestValueLogRewriteOnline_SourceFileIDsWithStaleFilterMatchesPlanSelection( if stats.RecordsCopied != 1 { t.Fatalf("expected one rewritten record from selected explicit source, got %d", stats.RecordsCopied) } + if stats.SourceSegmentsRequested != 1 { + t.Fatalf("source segments requested=%d want 1", stats.SourceSegmentsRequested) + } + if stats.SourceSegmentsStillReferenced != 0 { + t.Fatalf("source segments still referenced=%d want 0", stats.SourceSegmentsStillReferenced) + } + if stats.SourceSegmentsUnreferenced != 1 { + t.Fatalf("source segments unreferenced=%d want 1", stats.SourceSegmentsUnreferenced) + } ptrK1, flagsK1 := readProjectedPointerByKey(t, db, []byte("k1")) ptrK2, flagsK2 := readProjectedPointerByKey(t, db, []byte("k2")) diff --git a/TreeDB/env_vlog_overrides_test.go b/TreeDB/env_vlog_overrides_test.go index 019b46ef0..eeebdc1a5 100644 --- a/TreeDB/env_vlog_overrides_test.go +++ b/TreeDB/env_vlog_overrides_test.go @@ -153,3 +153,43 @@ func TestApplyEnvMaintenanceOverrides_VlogDictClassModeDefaultAlias(t *testing.T t.Fatalf("expected dict class mode single for default alias, got %v", got) } } + +func TestApplyEnvMaintenanceOverrides_VlogRetainedCaps(t *testing.T) { + opts := Options{} + t.Setenv(envVlogMaxRetainedBytes, "123456") + 
t.Setenv(envVlogMaxRetainedBytesHard, "654321") + applyEnvMaintenanceOverrides(&opts) + if got := opts.ValueLog.MaxRetainedBytes; got != 123456 { + t.Fatalf("expected max retained bytes=123456, got %d", got) + } + if got := opts.ValueLog.MaxRetainedBytesHard; got != 654321 { + t.Fatalf("expected max retained bytes hard=654321, got %d", got) + } +} + +func TestApplyEnvMaintenanceOverrides_VlogRewriteControls(t *testing.T) { + opts := Options{} + t.Setenv(envVlogRewriteBudgetBytesPerSec, "123456789") + t.Setenv(envVlogRewriteBudgetRecordsPerSec, "4321") + t.Setenv(envVlogRewriteTriggerTotalBytes, "987654321") + t.Setenv(envVlogRewriteTriggerStaleRatioPPM, "345678") + t.Setenv(envVlogRewriteTriggerChurnPerSec, "13579") + applyEnvMaintenanceOverrides(&opts) + + gen := opts.ValueLog.Generational + if got := gen.RewriteBudgetBytesPerSec; got != 123456789 { + t.Fatalf("expected rewrite budget bytes/sec=123456789, got %d", got) + } + if got := gen.RewriteBudgetRecordsPerSec; got != 4321 { + t.Fatalf("expected rewrite budget records/sec=4321, got %d", got) + } + if got := gen.RewriteTriggerTotalBytes; got != 987654321 { + t.Fatalf("expected rewrite trigger total bytes=987654321, got %d", got) + } + if got := gen.RewriteTriggerStaleRatioPPM; got != 345678 { + t.Fatalf("expected rewrite trigger stale ratio ppm=345678, got %d", got) + } + if got := gen.RewriteTriggerChurnPerSec; got != 13579 { + t.Fatalf("expected rewrite trigger churn/sec=13579, got %d", got) + } +} diff --git a/TreeDB/internal/compression/profile.go b/TreeDB/internal/compression/profile.go index 0114c98e6..2e5e705a3 100644 --- a/TreeDB/internal/compression/profile.go +++ b/TreeDB/internal/compression/profile.go @@ -44,6 +44,13 @@ type kScore struct { score float64 } +const ( + // Bound evaluation work so training cost stays predictable on long streams. + // Use even down-sampling rather than prefix truncation to preserve shape. 
+ maxChooseKEvalSamples = 4096 + maxDecodeCostSamples = 256 +) + func ChooseKForDict(dict []byte, samples [][]byte) (profile *ActiveProfile) { return ChooseKForDictOptions(dict, samples, ChooseKOptions{}) } @@ -59,8 +66,8 @@ func ChooseKForDictOptions(dict []byte, samples [][]byte, opts ChooseKOptions) ( return nil } eval := samples - if len(eval) > 10000 { - eval = eval[:10000] + if len(eval) > maxChooseKEvalSamples { + eval = evenlySampleRecords(eval, maxChooseKEvalSamples) } rawTotal := 0 for _, v := range eval { @@ -70,15 +77,39 @@ func ChooseKForDictOptions(dict []byte, samples [][]byte, opts ChooseKOptions) ( return nil } - nsPerByte := decodeCostEstimate(dict, eval) - if opts.DecodeNsPerRawByte > 0 { - nsPerByte = opts.DecodeNsPerRawByte + nsPerByte := opts.DecodeNsPerRawByte + if nsPerByte <= 0 { + nsPerByte = decodeCostEstimate(dict, eval) } ks := opts.CandidateK if len(ks) == 0 { ks = []int{1, 2, 4, 8, 16, 32} } ks = normalizeCandidateK(ks) + var sharedEnc *zstd.Encoder + if dict != nil { + if enc, err := zstd.NewWriter(nil, + zstd.WithEncoderDict(dict), + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ); err == nil { + sharedEnc = enc + } + } else { + if enc, err := zstd.NewWriter(nil, + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ); err == nil { + sharedEnc = enc + } + } + if sharedEnc != nil { + defer sharedEnc.Close() + } + var concatScratch []byte + var encodedScratch []byte scores := make([]kScore, 0, len(ks)) var baseline kScore for _, k := range ks { @@ -89,7 +120,12 @@ func ChooseKForDictOptions(dict []byte, samples [][]byte, opts ChooseKOptions) ( if used == 0 { continue } - payload, meta, raw, encodeNs := batchTotals(dict, eval[:used], k, opts.EncodeNsPerRawByte) + payload, meta, raw, encodeNs := 0, 0, 0, int64(0) + if sharedEnc != nil { + payload, meta, raw, encodeNs = batchTotalsWithEncoder(sharedEnc, eval[:used], k, 
opts.EncodeNsPerRawByte, &concatScratch, &encodedScratch) + } else { + payload, meta, raw, encodeNs = batchTotals(dict, eval[:used], k, opts.EncodeNsPerRawByte) + } if raw == 0 { continue } @@ -200,18 +236,43 @@ func batchTotals(dict []byte, samples [][]byte, k int, encodeNsPerRawByte float6 return 0, 0, 0, 0 } samples = samples[:n] - batches := n / k var enc *zstd.Encoder var err error if dict != nil { - enc, err = zstd.NewWriter(nil, zstd.WithEncoderDict(dict), zstd.WithEncoderLevel(zstd.SpeedFastest)) + enc, err = zstd.NewWriter(nil, + zstd.WithEncoderDict(dict), + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ) } else { - enc, err = zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedFastest)) + enc, err = zstd.NewWriter(nil, + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ) } if err != nil { return 0, 0, 0, 0 } defer enc.Close() + var concatScratch []byte + var encodedScratch []byte + return batchTotalsWithEncoder(enc, samples, k, encodeNsPerRawByte, &concatScratch, &encodedScratch) +} + +func batchTotalsWithEncoder(enc *zstd.Encoder, samples [][]byte, k int, encodeNsPerRawByte float64, concatScratch *[]byte, encodedScratch *[]byte) (payload int, meta int, raw int, encodeNs int64) { + if enc == nil || k <= 0 { + return 0, 0, 0, 0 + } + n := (len(samples) / k) * k + if n == 0 { + return 0, 0, 0, 0 + } + samples = samples[:n] + batches := n / k + buf := *concatScratch + encoded := *encodedScratch started := time.Now() for b := 0; b < batches; b++ { start := b * k @@ -221,14 +282,18 @@ func batchTotals(dict []byte, samples [][]byte, k int, encodeNsPerRawByte float6 raw += len(samples[i]) total += len(samples[i]) } - buf := make([]byte, total) + if cap(buf) < total { + buf = make([]byte, total) + } else { + buf = buf[:total] + } pos := 0 for i := start; i < end; i++ { copy(buf[pos:], samples[i]) pos += len(samples[i]) } - c := 
enc.EncodeAll(buf, nil) - payload += len(c) + encoded = enc.EncodeAll(buf, encoded[:0]) + payload += len(encoded) // Account for the full on-disk framing overhead: // - record header (CRC/version/flags/txn/bodyLen) // - frame header + dict_id + RID table + offsets table @@ -245,34 +310,45 @@ func batchTotals(dict []byte, samples [][]byte, k int, encodeNsPerRawByte float6 } else { encodeNs = time.Since(started).Nanoseconds() } + *concatScratch = buf[:0] + *encodedScratch = encoded[:0] return payload, meta, raw, encodeNs } func decodeCostEstimate(dict []byte, samples [][]byte) float64 { - n := len(samples) - if n > 500 { - n = 500 + eval := samples + if len(eval) > maxDecodeCostSamples { + eval = evenlySampleRecords(eval, maxDecodeCostSamples) } - enc, err := zstd.NewWriter(nil, zstd.WithEncoderDict(dict), zstd.WithEncoderLevel(zstd.SpeedFastest)) + n := len(eval) + enc, err := zstd.NewWriter(nil, + zstd.WithEncoderDict(dict), + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ) if err != nil { return 1.0 } defer enc.Close() - frames := make([][]byte, n) + totalRaw := 0 + var encoded []byte for i := 0; i < n; i++ { - totalRaw += len(samples[i]) - frames[i] = enc.EncodeAll(samples[i], nil) + totalRaw += len(eval[i]) + encoded = enc.EncodeAll(eval[i], encoded[:0]) } dec, err := zstd.NewReader(nil, zstd.WithDecoderDicts(dict)) if err != nil { return 1.0 } defer dec.Close() + var out []byte start := time.Now() for i := 0; i < n; i++ { - out, _ = dec.DecodeAll(frames[i], out[:0]) + encoded = enc.EncodeAll(eval[i], encoded[:0]) + out, _ = dec.DecodeAll(encoded, out[:0]) if len(out) > 0 { _ = out[0] } @@ -283,3 +359,26 @@ func decodeCostEstimate(dict []byte, samples [][]byte) float64 { } return float64(elapsed.Nanoseconds()) / float64(totalRaw) } + +func evenlySampleRecords(samples [][]byte, limit int) [][]byte { + if limit <= 0 || len(samples) <= limit { + return samples + } + out := make([][]byte, 0, limit) + last 
:= -1 + for i := 0; i < limit; i++ { + idx := (i * len(samples)) / limit + if idx >= len(samples) { + idx = len(samples) - 1 + } + if idx <= last { + idx = last + 1 + if idx >= len(samples) { + idx = len(samples) - 1 + } + } + last = idx + out = append(out, samples[idx]) + } + return out +} diff --git a/TreeDB/internal/compression/profile_test.go b/TreeDB/internal/compression/profile_test.go new file mode 100644 index 000000000..1c1c26136 --- /dev/null +++ b/TreeDB/internal/compression/profile_test.go @@ -0,0 +1,93 @@ +package compression + +import ( + "bytes" + "encoding/binary" + "testing" + + "github.com/snissn/compress/zstd" +) + +func buildProfileSamples(n int) [][]byte { + samples := make([][]byte, 0, n) + base := bytes.Repeat([]byte("compressible-"), 64) + for i := 0; i < n; i++ { + buf := make([]byte, 1024) + copy(buf, base) + binary.LittleEndian.PutUint32(buf[len(buf)-4:], uint32(i)) + samples = append(samples, buf) + } + return samples +} + +func mustBuildValidDict(t *testing.T, samples [][]byte) []byte { + t.Helper() + history := make([]byte, 0, 1<<16) + for _, s := range samples { + history = append(history, s...) 
+ } + dict, err := buildAndValidateDict(42, samples, history, zstd.SpeedFastest) + if err != nil { + t.Fatalf("build dict: %v", err) + } + if len(dict) == 0 { + t.Fatalf("expected non-empty dict") + } + return dict +} + +func TestBatchTotalsWithEncoder_MatchesBatchTotals_NoDict(t *testing.T) { + samples := buildProfileSamples(16) + encodeNsPerRawByte := 1.25 + + for _, k := range []int{1, 2, 4, 8} { + wantPayload, wantMeta, wantRaw, wantEncodeNS := batchTotals(nil, samples, k, encodeNsPerRawByte) + enc, err := zstd.NewWriter(nil, + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ) + if err != nil { + t.Fatalf("new writer: %v", err) + } + + var concatScratch []byte + var encodedScratch []byte + gotPayload, gotMeta, gotRaw, gotEncodeNS := batchTotalsWithEncoder(enc, samples, k, encodeNsPerRawByte, &concatScratch, &encodedScratch) + _ = enc.Close() + + if gotPayload != wantPayload || gotMeta != wantMeta || gotRaw != wantRaw || gotEncodeNS != wantEncodeNS { + t.Fatalf("k=%d mismatch got=(payload=%d meta=%d raw=%d encodeNs=%d) want=(payload=%d meta=%d raw=%d encodeNs=%d)", + k, gotPayload, gotMeta, gotRaw, gotEncodeNS, wantPayload, wantMeta, wantRaw, wantEncodeNS) + } + } +} + +func TestBatchTotalsWithEncoder_MatchesBatchTotals_WithDict(t *testing.T) { + samples := buildProfileSamples(256) + dict := mustBuildValidDict(t, samples) + encodeNsPerRawByte := 2.0 + + for _, k := range []int{1, 2, 3, 6} { + wantPayload, wantMeta, wantRaw, wantEncodeNS := batchTotals(dict, samples, k, encodeNsPerRawByte) + enc, err := zstd.NewWriter(nil, + zstd.WithEncoderDict(dict), + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ) + if err != nil { + t.Fatalf("new dict writer: %v", err) + } + + var concatScratch []byte + var encodedScratch []byte + gotPayload, gotMeta, gotRaw, gotEncodeNS := batchTotalsWithEncoder(enc, samples, k, encodeNsPerRawByte, &concatScratch, 
&encodedScratch) + _ = enc.Close() + + if gotPayload != wantPayload || gotMeta != wantMeta || gotRaw != wantRaw || gotEncodeNS != wantEncodeNS { + t.Fatalf("k=%d mismatch got=(payload=%d meta=%d raw=%d encodeNs=%d) want=(payload=%d meta=%d raw=%d encodeNs=%d)", + k, gotPayload, gotMeta, gotRaw, gotEncodeNS, wantPayload, wantMeta, wantRaw, wantEncodeNS) + } + } +} diff --git a/TreeDB/internal/compression/trainer.go b/TreeDB/internal/compression/trainer.go index 31c004bbc..ee525935c 100644 --- a/TreeDB/internal/compression/trainer.go +++ b/TreeDB/internal/compression/trainer.go @@ -838,7 +838,12 @@ func (t *Trainer) train(samples [][]byte, dictBytes int, level zstd.EncoderLevel } } - enc, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(level), zstd.WithEncoderCRC(false), zstd.WithEncoderDict(bestProfile.Dict)) + enc, err := zstd.NewWriter(nil, + zstd.WithEncoderLevel(level), + zstd.WithEncoderCRC(false), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderDict(bestProfile.Dict), + ) if err != nil { log.Printf("treedb: dict training encode setup failed stream=%d err=%v", slabID, err) return @@ -930,7 +935,12 @@ func shapeAndValidateDict(dict []byte, dictBytes int, level zstd.EncoderLevel) ( } func validateDict(dict []byte, level zstd.EncoderLevel) error { - enc, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(level), zstd.WithEncoderCRC(false), zstd.WithEncoderDict(dict)) + enc, err := zstd.NewWriter(nil, + zstd.WithEncoderLevel(level), + zstd.WithEncoderCRC(false), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderDict(dict), + ) if err != nil { return err } diff --git a/TreeDB/internal/valuelog/manager.go b/TreeDB/internal/valuelog/manager.go index 21c3d5663..f6102a29d 100644 --- a/TreeDB/internal/valuelog/manager.go +++ b/TreeDB/internal/valuelog/manager.go @@ -504,6 +504,11 @@ func (f *File) ReadUnsafeTo(ptr page.ValuePtr, verifyCRC bool, dst []byte) ([]by } } f.mmapReadFallbackReadAt.Add(1) + if !verifyCRC { + if val, usedDst, err, ok := 
f.readGroupedCompressedFromFileTo(ptr, dst); ok { + return val, usedDst, err + } + } return ReadAtWithDictTo(f.File, ptr, verifyCRC, f.dictLookup, f.templateLookup, f.templateDefCache, f.templateDecodeOpts, dst) } // Avoid per-read Stat/lock churn once we have exhausted the dead-mapping @@ -519,9 +524,185 @@ func (f *File) ReadUnsafeTo(ptr page.ValuePtr, verifyCRC bool, dst []byte) ([]by f.mmapReadMissDeadMappingCap.Add(1) } f.mmapReadFallbackReadAt.Add(1) + if !verifyCRC { + if val, usedDst, err, ok := f.readGroupedCompressedFromFileTo(ptr, dst); ok { + return val, usedDst, err + } + } return ReadAtWithDictTo(f.File, ptr, verifyCRC, f.dictLookup, f.templateLookup, f.templateDefCache, f.templateDecodeOpts, dst) } +// readGroupedCompressedFromFileTo handles grouped+compressed reads on the +// non-mmap fallback path while reusing File grouped-frame cache entries. +// +// ok=false means the caller should fall back to the generic ReadAtWithDictTo +// decoder path (for non-grouped / uncompressed / checksum-verified cases). 
+func (f *File) readGroupedCompressedFromFileTo(ptr page.ValuePtr, dst []byte) ([]byte, bool, error, bool) { + if f == nil || f.File == nil { + return nil, false, errors.New("valuelog: nil file"), true + } + if ptr.Offset < 4 || !page.ValuePtrIsGrouped(ptr) { + return nil, false, nil, false + } + + start := int64(ptr.Offset - 4) + var header [HeaderSize]byte + if _, err := f.File.ReadAt(header[:], start); err != nil { + return nil, false, err, true + } + if header[4] != Version { + return nil, false, ErrCorrupt, true + } + if header[5]&recordFlagGrouped == 0 { + return nil, false, nil, false + } + valueLen := binary.LittleEndian.Uint32(header[16:20]) + if recordSizeExceedsMax(valueLen) { + return nil, false, ErrRecordTooLarge, true + } + expectedLen := uint32(headerWithoutCRC) + valueLen + if !page.ValuePtrRecordLengthHintMatches(ptr, expectedLen) { + return nil, false, ErrCorrupt, true + } + if int(valueLen) < FrameHeaderSize { + return nil, false, ErrCorrupt, true + } + + frameOff := start + HeaderSize + var frameHeader [FrameHeaderSize]byte + if _, err := f.File.ReadAt(frameHeader[:], frameOff); err != nil { + return nil, false, err, true + } + if frameHeader[0] != FrameVersion { + return nil, false, ErrCorrupt, true + } + k := int(frameHeader[2]) + if k <= 0 || k > MaxFrameK { + return nil, false, ErrCorrupt, true + } + if frameHeader[1]&FrameFlagCompressed == 0 { + return nil, false, nil, false + } + + subIndex := int(page.ValuePtrSubIndex(ptr)) + if subIndex < 0 || subIndex >= k { + return nil, false, ErrCorrupt, true + } + if cachedRaw, valStart, valEnd, rawLen, hit := f.groupedFrameCacheLookup(start, false, subIndex); hit { + if uint32(len(cachedRaw)) != rawLen || valEnd < valStart || valEnd > rawLen { + return nil, false, ErrCorrupt, true + } + val := cachedRaw[valStart:valEnd] + if f.templateLookup != nil && templ.IsEncodedPayload(val) { + decoded, err := templ.DecodePayloadAppend(nil, val, func(id uint64) (templ.TemplateDef, error) { + return 
resolveTemplateDef(id, f.templateLookup, f.templateDefCache) + }, f.templateDecodeOpts) + if err != nil { + return nil, false, err, true + } + return decoded, false, nil, true + } + if dst != nil && cap(dst) >= len(val) { + out := dst[:len(val)] + copy(out, val) + return out, true, nil, true + } + out := make([]byte, len(val)) + copy(out, val) + return out, false, nil, true + } + + ridBytes := k * 8 + offsetBytes := (k + 1) * 4 + prefixLen := FrameHeaderSize + ridBytes + offsetBytes + if int(valueLen) < prefixLen { + return nil, false, ErrCorrupt, true + } + + payloadScratch := getDecodeScratch(int(valueLen)) + defer putDecodeScratch(payloadScratch) + payload := payloadScratch[:int(valueLen)] + if _, err := f.File.ReadAt(payload, start+HeaderSize); err != nil { + return nil, false, err, true + } + + off := FrameHeaderSize + ridBytes + var offsets [MaxFrameK + 1]uint32 + prev := uint32(0) + for i := 0; i < k+1; i++ { + cur := binary.LittleEndian.Uint32(payload[off : off+4]) + if cur < prev { + return nil, false, ErrCorrupt, true + } + offsets[i] = cur + prev = cur + off += 4 + } + rawLen := offsets[k] + if limits.MaxRecordSize > 0 && int64(rawLen) > limits.MaxRecordSize { + return nil, false, ErrRecordTooLarge, true + } + valStart := offsets[subIndex] + valEnd := offsets[subIndex+1] + if valEnd < valStart || valEnd > rawLen { + return nil, false, ErrCorrupt, true + } + + frame := FrameHeader{ + Version: frameHeader[0], + Flags: frameHeader[1], + K: uint8(k), + Reserved: frameHeader[3], + DictID: binary.LittleEndian.Uint64(frameHeader[4:12]), + } + + raw := f.takeDecodeScratch(int(rawLen)) + pooledRaw := true + raw, err := decodeFramePayloadTo(frame, payload[prefixLen:], f.dictLookup, rawLen, raw) + if err != nil { + if pooledRaw { + f.releaseDecodeScratch(raw) + } + return nil, false, err, true + } + if uint32(len(raw)) != rawLen { + if pooledRaw { + f.releaseDecodeScratch(raw) + } + return nil, false, ErrCorrupt, true + } + cachedRaw := 
f.groupedFrameCacheStore(start, false, k, offsets, raw, true) + + val := raw[valStart:valEnd] + if f.templateLookup != nil && templ.IsEncodedPayload(val) { + decoded, err := templ.DecodePayloadAppend(nil, val, func(id uint64) (templ.TemplateDef, error) { + return resolveTemplateDef(id, f.templateLookup, f.templateDefCache) + }, f.templateDecodeOpts) + if pooledRaw && !cachedRaw { + f.releaseDecodeScratch(raw) + } + if err != nil { + return nil, false, err, true + } + return decoded, false, nil, true + } + + if dst != nil && cap(dst) >= len(val) { + out := dst[:len(val)] + copy(out, val) + if pooledRaw && !cachedRaw { + f.releaseDecodeScratch(raw) + } + return out, true, nil, true + } + out := make([]byte, len(val)) + copy(out, val) + if pooledRaw && !cachedRaw { + f.releaseDecodeScratch(raw) + } + return out, false, nil, true +} + func (f *File) ReadAppend(ptr page.ValuePtr, verifyCRC bool, dst []byte) ([]byte, error) { if f == nil || f.File == nil { return nil, errors.New("valuelog: nil file") @@ -1290,6 +1471,50 @@ func (m *Manager) RemapStats() (remaps uint64, deadMappings uint64) { return remaps, deadMappings } +func valueLogFileSizeBestEffort(f *File) uint64 { + if f == nil { + return 0 + } + if known := f.fileSize.Load(); known > 0 { + return uint64(known) + } + if data, _ := f.mmapData.Load().([]byte); len(data) > 0 { + return uint64(len(data)) + } + if f.Path != "" { + if info, err := os.Stat(f.Path); err == nil && info.Size() > 0 { + return uint64(info.Size()) + } + } + return 0 +} + +// ZombieStats reports tracked zombie segments and their approximate byte totals. +// A zombie remains on disk until all snapshots release it (RefCount reaches 0). 
+func (m *Manager) ZombieStats() (segments uint64, bytes uint64, pinnedSegments uint64, pinnedBytes uint64, unpinnedSegments uint64, unpinnedBytes uint64) { + if m == nil { + return 0, 0, 0, 0, 0, 0 + } + m.mu.RLock() + for _, f := range m.files { + if f == nil || !f.IsZombie.Load() { + continue + } + segments++ + size := valueLogFileSizeBestEffort(f) + bytes += size + if f.RefCount.Load() > 0 { + pinnedSegments++ + pinnedBytes += size + continue + } + unpinnedSegments++ + unpinnedBytes += size + } + m.mu.RUnlock() + return segments, bytes, pinnedSegments, pinnedBytes, unpinnedSegments, unpinnedBytes +} + // MmapResidencyStats reports aggregate mmap residency split by segment type: // current writable segments, sealed segments, and dead mappings/bytes. func (m *Manager) MmapResidencyStats() (currentSegments uint64, currentBytes uint64, sealedSegments uint64, sealedBytes uint64, deadMappings uint64, deadBytes uint64) { diff --git a/TreeDB/internal/valuelog/manager_test.go b/TreeDB/internal/valuelog/manager_test.go index d6cd2e780..e2b3fd43c 100644 --- a/TreeDB/internal/valuelog/manager_test.go +++ b/TreeDB/internal/valuelog/manager_test.go @@ -92,6 +92,38 @@ func TestManagerMmapResidencyStatsAggregatesCounters(t *testing.T) { } } +func TestManagerZombieStatsAggregatesPinnedAndUnpinned(t *testing.T) { + mgr := &Manager{ + files: map[uint32]*File{ + 1: {}, + 2: {}, + 3: {}, + }, + } + // Zombie + pinned. + mgr.files[1].IsZombie.Store(true) + mgr.files[1].RefCount.Store(2) + mgr.files[1].fileSize.Store(100) + // Zombie + unpinned. + mgr.files[2].IsZombie.Store(true) + mgr.files[2].RefCount.Store(0) + mgr.files[2].fileSize.Store(200) + // Non-zombie should be ignored. 
+ mgr.files[3].RefCount.Store(9) + mgr.files[3].fileSize.Store(300) + + segments, bytes, pinnedSegments, pinnedBytes, unpinnedSegments, unpinnedBytes := mgr.ZombieStats() + if segments != 2 || bytes != 300 { + t.Fatalf("ZombieStats total mismatch: segments=%d bytes=%d want segments=2 bytes=300", segments, bytes) + } + if pinnedSegments != 1 || pinnedBytes != 100 { + t.Fatalf("ZombieStats pinned mismatch: segments=%d bytes=%d want segments=1 bytes=100", pinnedSegments, pinnedBytes) + } + if unpinnedSegments != 1 || unpinnedBytes != 200 { + t.Fatalf("ZombieStats unpinned mismatch: segments=%d bytes=%d want segments=1 bytes=200", unpinnedSegments, unpinnedBytes) + } +} + func TestManagerPromoteCurrentWritable_SwitchesPriorLaneSegmentToSealed(t *testing.T) { mgr := &Manager{ files: make(map[uint32]*File), diff --git a/TreeDB/internal/valuelog/valuelog_test.go b/TreeDB/internal/valuelog/valuelog_test.go index b974c59ff..96ade0364 100644 --- a/TreeDB/internal/valuelog/valuelog_test.go +++ b/TreeDB/internal/valuelog/valuelog_test.go @@ -802,6 +802,92 @@ func TestValueLogManager_GroupedFrameCache_MaxRawBytesSkipsOversize(t *testing.T } } +func TestValueLogManager_ReadUnsafeTo_CompressedGroupedFallbackUsesCache(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("mmap not supported on windows") + } + + // Force file-read fallback so this test exercises the non-mmap path. 
+ withMappedSealedBudget(t, 0) + + dir := t.TempDir() + fileID, err := EncodeFileID(0, 1) + if err != nil { + t.Fatalf("encode file id: %v", err) + } + path := filepath.Join(dir, "value-l0-000001.log") + + writer, err := NewWriter(path, fileID) + if err != nil { + t.Fatalf("new writer: %v", err) + } + writer.SetBlockCompression(BlockCodecSnappy, true) + ptrs, want := appendCompressedFrameForCacheTests(t, writer, 0, 4) + if err := writer.Close(); err != nil { + t.Fatalf("close: %v", err) + } + + m, err := NewManager(dir) + if err != nil { + t.Fatalf("new manager: %v", err) + } + defer func() { _ = m.Close() }() + m.SetDisableReadChecksum(true) + m.SetGroupedFrameCacheEntries(4) + + f := m.files[fileID] + if f == nil { + t.Fatalf("missing opened file for id=%d", fileID) + } + + dst := make([]byte, 0, 512) + got0, used0, err := m.ReadUnsafeTo(ptrs[0], dst[:0]) + if err != nil { + t.Fatalf("read unsafe to first: %v", err) + } + if !used0 { + t.Fatalf("expected first read to use dst") + } + if !bytes.Equal(got0, want[0]) { + t.Fatalf("first value mismatch: got=%q want=%q", got0, want[0]) + } + + hits0, misses0, entries0, _ := f.groupedFrameCacheStats() + if misses0 == 0 { + t.Fatalf("expected first compressed grouped read to miss cache") + } + if entries0 == 0 { + t.Fatalf("expected first compressed grouped read to populate cache") + } + + got1, used1, err := m.ReadUnsafeTo(ptrs[1], dst[:0]) + if err != nil { + t.Fatalf("read unsafe to second: %v", err) + } + if !used1 { + t.Fatalf("expected second read to use dst") + } + if !bytes.Equal(got1, want[1]) { + t.Fatalf("second value mismatch: got=%q want=%q", got1, want[1]) + } + + hits1, misses1, entries1, _ := f.groupedFrameCacheStats() + if hits1 <= hits0 { + t.Fatalf("expected second read to hit grouped cache: hits before=%d after=%d", hits0, hits1) + } + if misses1 != misses0 { + t.Fatalf("unexpected cache miss increase on second read: before=%d after=%d", misses0, misses1) + } + if entries1 == 0 { + t.Fatalf("expected 
grouped cache entries to remain populated") + } + + _, _, missNoMapping, _, fallbacks := m.MmapReadStats() + if missNoMapping == 0 || fallbacks == 0 { + t.Fatalf("expected fallback path stats to reflect no-mmap reads: miss_no_mapping=%d fallbacks=%d", missNoMapping, fallbacks) + } +} + func TestReadAtGroupedFastPathWithoutChecksum(t *testing.T) { dir := t.TempDir() path := filepath.Join(dir, "value-000001.log") diff --git a/TreeDB/public.go b/TreeDB/public.go index 619c9051f..8436b482a 100644 --- a/TreeDB/public.go +++ b/TreeDB/public.go @@ -590,6 +590,7 @@ func Open(opts Options) (*DB, error) { ValueLogRewriteTriggerStaleRatioPPM: opts.ValueLog.Generational.RewriteTriggerStaleRatioPPM, ValueLogRewriteTriggerTotalBytes: opts.ValueLog.Generational.RewriteTriggerTotalBytes, ValueLogRewriteTriggerChurnPerSec: opts.ValueLog.Generational.RewriteTriggerChurnPerSec, + ValueLogRewriteMinSegmentAge: opts.ValueLog.Generational.RewriteMinSegmentAge, ForceValueLogPointers: opts.ValueLog.ForcePointers, ValueLogDictTrain: opts.ValueLog.DictTrain, ValueLogDictMaxK: opts.ValueLog.DictMaxK, @@ -696,20 +697,27 @@ const ( // - Dict training enabled (TrainBytes > 0), and // - Side stores enabled (dictdb), and // - Split value log enabled (value pointers used). 
- envVlogDictEnable = "TREEDB_VLOG_DICT_ENABLE" // bool - envVlogDictTrainBytes = "TREEDB_VLOG_DICT_TRAIN_BYTES" // int - envVlogDictBytes = "TREEDB_VLOG_DICT_BYTES" // int - envVlogDictMinRecords = "TREEDB_VLOG_DICT_MIN_RECORDS" // int - envVlogDictMaxRecordBytes = "TREEDB_VLOG_DICT_MAX_RECORD_BYTES" // int - envVlogDictSampleStride = "TREEDB_VLOG_DICT_SAMPLE_STRIDE" // int - envVlogDictDedupWindow = "TREEDB_VLOG_DICT_DEDUP_WINDOW" // int - envVlogDictTrainLevel = "TREEDB_VLOG_DICT_TRAIN_LEVEL" // int - envVlogDictMaxK = "TREEDB_VLOG_DICT_MAX_K" // int - envVlogDictClassMode = "TREEDB_VLOG_DICT_CLASS_MODE" // single|split_outer_leaf - envVlogDictZstdLevel = "TREEDB_VLOG_DICT_ZSTD_LEVEL" // fastest|default|better|best|int - envVlogDictEntropy = "TREEDB_VLOG_DICT_ENTROPY" // bool - envVlogDictAdaptiveRatio = "TREEDB_VLOG_DICT_ADAPTIVE_RATIO" // float64 - envVlogDictMinPayloadSavings = "TREEDB_VLOG_DICT_MIN_PAYLOAD_SAVINGS_RATIO" // float64 + envVlogDictEnable = "TREEDB_VLOG_DICT_ENABLE" // bool + envVlogDictTrainBytes = "TREEDB_VLOG_DICT_TRAIN_BYTES" // int + envVlogDictBytes = "TREEDB_VLOG_DICT_BYTES" // int + envVlogDictMinRecords = "TREEDB_VLOG_DICT_MIN_RECORDS" // int + envVlogDictMaxRecordBytes = "TREEDB_VLOG_DICT_MAX_RECORD_BYTES" // int + envVlogDictSampleStride = "TREEDB_VLOG_DICT_SAMPLE_STRIDE" // int + envVlogDictDedupWindow = "TREEDB_VLOG_DICT_DEDUP_WINDOW" // int + envVlogDictTrainLevel = "TREEDB_VLOG_DICT_TRAIN_LEVEL" // int + envVlogDictMaxK = "TREEDB_VLOG_DICT_MAX_K" // int + envVlogDictClassMode = "TREEDB_VLOG_DICT_CLASS_MODE" // single|split_outer_leaf + envVlogDictZstdLevel = "TREEDB_VLOG_DICT_ZSTD_LEVEL" // fastest|default|better|best|int + envVlogDictEntropy = "TREEDB_VLOG_DICT_ENTROPY" // bool + envVlogDictAdaptiveRatio = "TREEDB_VLOG_DICT_ADAPTIVE_RATIO" // float64 + envVlogDictMinPayloadSavings = "TREEDB_VLOG_DICT_MIN_PAYLOAD_SAVINGS_RATIO" // float64 + envVlogMaxRetainedBytes = "TREEDB_VLOG_MAX_RETAINED_BYTES" // int64 + 
envVlogMaxRetainedBytesHard = "TREEDB_VLOG_MAX_RETAINED_BYTES_HARD" // int64 + envVlogRewriteBudgetBytesPerSec = "TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC" // int64 + envVlogRewriteBudgetRecordsPerSec = "TREEDB_VLOG_REWRITE_BUDGET_RECORDS_PER_SEC" // int + envVlogRewriteTriggerTotalBytes = "TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES" // int64 + envVlogRewriteTriggerStaleRatioPPM = "TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM" // uint32 + envVlogRewriteTriggerChurnPerSec = "TREEDB_VLOG_REWRITE_TRIGGER_CHURN_PER_SEC" // int64 ) func applyEnvMaintenanceOverrides(opts *Options) { @@ -828,6 +836,30 @@ func applyEnvMaintenanceOverrides(opts *Options) { if v, ok := envFloat64(envVlogDictMinPayloadSavings); ok { opts.ValueLog.DictMinPayloadSavingsRatio = v } + if v, ok := envInt(envVlogMaxRetainedBytes); ok { + opts.ValueLog.MaxRetainedBytes = int64(v) + } + if v, ok := envInt(envVlogMaxRetainedBytesHard); ok { + opts.ValueLog.MaxRetainedBytesHard = int64(v) + } + if v, ok := envInt(envVlogRewriteBudgetBytesPerSec); ok { + opts.ValueLog.Generational.RewriteBudgetBytesPerSec = int64(v) + } + if v, ok := envInt(envVlogRewriteBudgetRecordsPerSec); ok { + opts.ValueLog.Generational.RewriteBudgetRecordsPerSec = v + } + if v, ok := envInt(envVlogRewriteTriggerTotalBytes); ok { + opts.ValueLog.Generational.RewriteTriggerTotalBytes = int64(v) + } + if v, ok := envInt(envVlogRewriteTriggerStaleRatioPPM); ok { + if v < 0 { + v = 0 + } + opts.ValueLog.Generational.RewriteTriggerStaleRatioPPM = uint32(v) + } + if v, ok := envInt(envVlogRewriteTriggerChurnPerSec); ok { + opts.ValueLog.Generational.RewriteTriggerChurnPerSec = int64(v) + } } func computeDurabilityMode(opts Options) string { diff --git a/TreeDB/vlog_rewrite.go b/TreeDB/vlog_rewrite.go index fd7879b87..e685aec54 100644 --- a/TreeDB/vlog_rewrite.go +++ b/TreeDB/vlog_rewrite.go @@ -8,11 +8,18 @@ import ( // ValueLogRewriteStats summarizes value-log rewrite compaction results. 
type ValueLogRewriteStats struct { - SegmentsBefore int - SegmentsAfter int - BytesBefore int64 - BytesAfter int64 - RecordsCopied int + SegmentsBefore int + SegmentsAfter int + BytesBefore int64 + BytesAfter int64 + RecordsCopied int + ValueRecordsCopied int + ValueBytesCopied int64 + LeafRefRecordsCopied int + LeafRefBytesCopied int64 + SourceSegmentsRequested int + SourceSegmentsStillReferenced int + SourceSegmentsUnreferenced int } // ValueLogRewriteOnlineOptions controls online rewrite batching behavior. diff --git a/cmd/unified_bench/README.md b/cmd/unified_bench/README.md index 93c26afdd..85ef9e258 100644 --- a/cmd/unified_bench/README.md +++ b/cmd/unified_bench/README.md @@ -95,6 +95,7 @@ GOWORK=off GOMEMLIMIT=4GiB GOMAXPROCS=2 go test -json -p 1 . \ - `-treedb-allow-unsafe` TreeDB: allow unsafe durability/integrity options (required for unsafe toggles) - `-treedb-vlog-dict` TreeDB: value-log dict compression mode (`default|on|off|both`) - `-treedb-vlog-auto-policy` TreeDB: value-log auto policy (`balanced|throughput|size`) +- `-treedb-vlog-rewrite-min-segment-age-ms` TreeDB: minimum source segment age for online generational rewrite (`0`=default) - `-treedb-vlog-dict-frame-encode-level` TreeDB: dict frame zstd encoder level (`engine|fastest|default|better|best|all|`) - `-treedb-vlog-dict-frame-entropy` TreeDB: dict frame entropy mode (`engine|on|off|both`) - `-seed` PRNG seed for randomized tests (default 1; `0` = time-based) diff --git a/cmd/unified_bench/adapter_treedb.go b/cmd/unified_bench/adapter_treedb.go index 016af5206..ae19a2359 100644 --- a/cmd/unified_bench/adapter_treedb.go +++ b/cmd/unified_bench/adapter_treedb.go @@ -70,6 +70,7 @@ var ( treedbVlogRewriteTriggerStaleRatioPPM = flag.Uint("treedb-vlog-rewrite-trigger-stale-ratio-ppm", 0, "TreeDB: generational rewrite stale/live trigger in ppm (0=disabled)") treedbVlogRewriteTriggerTotalBytes = flag.Int64("treedb-vlog-rewrite-trigger-total-bytes", 0, "TreeDB: generational rewrite total retained 
bytes trigger (0=disabled)") treedbVlogRewriteTriggerChurnPerSec = flag.Int64("treedb-vlog-rewrite-trigger-churn-per-sec", 0, "TreeDB: generational rewrite churn trigger in bytes/sec (0=disabled)") + treedbVlogRewriteMinSegmentAgeMS = flag.Int("treedb-vlog-rewrite-min-segment-age-ms", 0, "TreeDB: generational rewrite minimum source segment age in milliseconds (0=default)") treedbVlogBlockTargetBytes = flag.Int("treedb-vlog-block-target-bytes", 0, "TreeDB: value-log block target compressed bytes (0=default)") treedbVlogIncompressibleHoldBytes = flag.Int("treedb-vlog-incompressible-hold-bytes", 0, "TreeDB: auto-mode incompressible hold bytes (0=default)") treedbVlogIncompressibleProbeBytes = flag.Int("treedb-vlog-incompressible-probe-bytes", 0, "TreeDB: auto-mode incompressible probe interval bytes (0=default)") @@ -359,6 +360,11 @@ func (r treeDBOptionsReport) formatText(indent string) string { lines = append(lines, fmt.Sprintf("vlog.rewrite_trigger_stale_ratio_ppm=%d", r.opts.ValueLog.Generational.RewriteTriggerStaleRatioPPM)) lines = append(lines, fmt.Sprintf("vlog.rewrite_trigger_total_bytes=%d", r.opts.ValueLog.Generational.RewriteTriggerTotalBytes)) lines = append(lines, fmt.Sprintf("vlog.rewrite_trigger_churn_per_sec=%d", r.opts.ValueLog.Generational.RewriteTriggerChurnPerSec)) + if minAge := r.opts.ValueLog.Generational.RewriteMinSegmentAge; minAge <= 0 { + lines = append(lines, fmt.Sprintf("vlog.rewrite_min_segment_age_ms=default (effective=%d)", int((30*time.Second)/time.Millisecond))) + } else { + lines = append(lines, fmt.Sprintf("vlog.rewrite_min_segment_age_ms=%d", int(minAge/time.Millisecond))) + } if target := r.opts.ValueLog.BlockTargetCompressedBytes; target <= 0 { lines = append(lines, "vlog.block_target_bytes=default (effective=4096B)") } else { @@ -663,6 +669,7 @@ func buildTreeDBOptions(dir string) (treedb.Options, treeDBOptionsReport, error) opts.ValueLog.Generational.RewriteTriggerStaleRatioPPM = 
clampUint32(uint64(*treedbVlogRewriteTriggerStaleRatioPPM)) opts.ValueLog.Generational.RewriteTriggerTotalBytes = *treedbVlogRewriteTriggerTotalBytes opts.ValueLog.Generational.RewriteTriggerChurnPerSec = *treedbVlogRewriteTriggerChurnPerSec + opts.ValueLog.Generational.RewriteMinSegmentAge = time.Duration(*treedbVlogRewriteMinSegmentAgeMS) * time.Millisecond if maintenanceMode == "bench" { // Disable background maintenance loops. "bench" mode aims for stable diff --git a/cmd/unified_bench/adapter_treedb_vlog_test.go b/cmd/unified_bench/adapter_treedb_vlog_test.go index 3a2dbd9f5..f54948848 100644 --- a/cmd/unified_bench/adapter_treedb_vlog_test.go +++ b/cmd/unified_bench/adapter_treedb_vlog_test.go @@ -131,6 +131,23 @@ func TestBuildTreeDBOptions_VlogDictClassModeFlag(t *testing.T) { } } +func TestBuildTreeDBOptions_VlogRewriteMinSegmentAgeFlag(t *testing.T) { + saved := saveTreeDBFlagState() + defer restoreTreeDBFlagState(saved) + + *treedbVlogRewriteMinSegmentAgeMS = 5000 + opts, rep, err := buildTreeDBOptions("") + if err != nil { + t.Fatalf("buildTreeDBOptions: %v", err) + } + if got := opts.ValueLog.Generational.RewriteMinSegmentAge.Milliseconds(); got != 5000 { + t.Fatalf("unexpected rewrite min segment age ms: got=%d want=5000", got) + } + if got := rep.formatText(""); !strings.Contains(got, "vlog.rewrite_min_segment_age_ms=5000") { + t.Fatalf("resolved options missing rewrite min segment age: %q", got) + } +} + func TestBuildTreeDBOptions_InvalidVlogDictClassMode(t *testing.T) { saved := saveTreeDBFlagState() defer restoreTreeDBFlagState(saved) diff --git a/cmd/unified_bench/main.go b/cmd/unified_bench/main.go index d7737e6ed..51f4c76f7 100644 --- a/cmd/unified_bench/main.go +++ b/cmd/unified_bench/main.go @@ -1270,8 +1270,26 @@ func printTreeDBCacheStats(w io.Writer, inst *DBInstance, prefix string) { "treedb.cache.vlog_generation.policy", "treedb.cache.vlog_generation.scheduler_state", "treedb.cache.vlog_generation.scheduler_last_reason", + 
"treedb.cache.vlog_generation.maintenance_phase", + "treedb.cache.vlog_generation.maintenance.attempts", + "treedb.cache.vlog_generation.maintenance.acquired", + "treedb.cache.vlog_generation.maintenance.collisions", + "treedb.cache.vlog_generation.maintenance.skip.wal_on_periodic", + "treedb.cache.vlog_generation.maintenance.skip.maintenance_phase", + "treedb.cache.vlog_generation.maintenance.skip.stage_gate", + "treedb.cache.vlog_generation.maintenance.skip.stage_gate_not_due", + "treedb.cache.vlog_generation.maintenance.skip.stage_gate_due_reserved", + "treedb.cache.vlog_generation.maintenance.skip.age_blocked_gate", + "treedb.cache.vlog_generation.maintenance.skip.priority_pending", + "treedb.cache.vlog_generation.maintenance.skip.quiet_window", + "treedb.cache.vlog_generation.maintenance.skip.before_first_checkpoint", + "treedb.cache.vlog_generation.maintenance.skip.checkpoint_inflight", "treedb.cache.vlog_generation.churn_bytes_total", "treedb.cache.vlog_generation.churn_bytes_per_sec", + "treedb.cache.vlog_generation.rewrite_trigger.stale_ratio_ppm", + "treedb.cache.vlog_generation.rewrite_trigger.total_bytes", + "treedb.cache.vlog_generation.rewrite_trigger.churn_per_sec", + "treedb.cache.vlog_generation.rewrite.min_segment_age_ms", "treedb.cache.vlog_generation.bytes.live.total", "treedb.cache.vlog_generation.bytes.live.hot", "treedb.cache.vlog_generation.bytes.live.warm", @@ -1285,12 +1303,79 @@ func printTreeDBCacheStats(w io.Writer, inst *DBInstance, prefix string) { "treedb.cache.vlog_generation.segments.hot", "treedb.cache.vlog_generation.segments.warm", "treedb.cache.vlog_generation.segments.cold", + "treedb.cache.vlog_generation.rewrite.queue_len", + "treedb.cache.vlog_generation.rewrite.queue_loaded", + "treedb.cache.vlog_generation.rewrite.ledger_segments", + "treedb.cache.vlog_generation.rewrite.ledger_bytes_total", + "treedb.cache.vlog_generation.rewrite.ledger_bytes_live", + "treedb.cache.vlog_generation.rewrite.ledger_bytes_stale", + 
"treedb.cache.vlog_generation.rewrite.ledger_stale_ratio_ppm", + "treedb.cache.vlog_generation.rewrite.stage_pending", + "treedb.cache.vlog_generation.rewrite.stage_observed_unix_nano", + "treedb.cache.vlog_generation.rewrite.penalties_active", + "treedb.cache.vlog_generation.rewrite.age_blocked_until_unix_nano", + "treedb.cache.vlog_generation.rewrite.age_blocked_remaining_ms", + "treedb.cache.vlog_generation.rewrite.plan_runs", + "treedb.cache.vlog_generation.rewrite.plan_canceled", + "treedb.cache.vlog_generation.rewrite.plan_errors", + "treedb.cache.vlog_generation.rewrite.plan_empty", + "treedb.cache.vlog_generation.rewrite.plan_empty.age_blocked", + "treedb.cache.vlog_generation.rewrite.plan_empty.no_selection", + "treedb.cache.vlog_generation.rewrite.plan_selected", + "treedb.cache.vlog_generation.rewrite.plan_selected_segments_total", + "treedb.cache.vlog_generation.rewrite.plan_selected_bytes_total", + "treedb.cache.vlog_generation.rewrite.plan_selected_bytes_live", + "treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale", "treedb.cache.vlog_generation.rewrite.bytes_in", "treedb.cache.vlog_generation.rewrite.bytes_out", + "treedb.cache.vlog_generation.rewrite.value_records_copied", + "treedb.cache.vlog_generation.rewrite.value_bytes_copied", + "treedb.cache.vlog_generation.rewrite.leafref_records_copied", + "treedb.cache.vlog_generation.rewrite.leafref_bytes_copied", + "treedb.cache.vlog_generation.rewrite.reclaim_ratio", + "treedb.cache.vlog_generation.rewrite.output_ratio", + "treedb.cache.vlog_generation.rewrite.processed_stale_ratio", + "treedb.cache.vlog_generation.rewrite.exec.bytes_in_per_sec", + "treedb.cache.vlog_generation.rewrite.exec.bytes_out_per_sec", + "treedb.cache.vlog_generation.rewrite.exec.reclaimed_bytes_per_sec", + "treedb.cache.vlog_generation.rewrite.exec.reclaimed_vs_churn_ratio", + "treedb.cache.vlog_generation.rewrite.no_reclaim_runs", + "treedb.cache.vlog_generation.rewrite.no_reclaim_stale_bytes", + 
"treedb.cache.vlog_generation.rewrite.canceled_runs", + "treedb.cache.vlog_generation.rewrite.deadline_runs", + "treedb.cache.vlog_generation.rewrite.ineffective_runs", + "treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_per_sec", + "treedb.cache.vlog_generation.rewrite_budget.consumed_share_of_budget_pct", + "treedb.cache.vlog_generation.rewrite_budget.bytes_per_sec", + "treedb.cache.vlog_generation.rewrite_budget.records_per_sec", + "treedb.cache.vlog_generation.rewrite_budget.tokens_bytes", + "treedb.cache.vlog_generation.rewrite_budget.tokens_cap_bytes", + "treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct", + "treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total", "treedb.cache.vlog_generation.rewrite.runs", "treedb.cache.vlog_generation.gc.deleted_segments", "treedb.cache.vlog_generation.gc.deleted_bytes", + "treedb.cache.vlog_generation.gc.last_observed_source.segments", + "treedb.cache.vlog_generation.gc.last_observed_source.segments_referenced", + "treedb.cache.vlog_generation.gc.last_observed_source.segments_eligible", + "treedb.cache.vlog_generation.gc.last_observed_source.segments_pending", + "treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_retained", + "treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_in_use", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes_referenced", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes_eligible", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes_pending", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_retained", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_in_use", "treedb.cache.vlog_generation.gc.runs", + "treedb.cache.vlog_retained_prune.runs", + "treedb.cache.vlog_retained_prune.forced_runs", + "treedb.cache.vlog_retained_prune.removed_segments", + 
"treedb.cache.vlog_retained_prune.removed_bytes", + "treedb.cache.vlog_retained_prune.live_skipped_segments", + "treedb.cache.vlog_retained_prune.live_skipped_bytes", + "treedb.cache.vlog_retained_prune.zombie_marked_segments", + "treedb.cache.vlog_retained_prune.zombie_marked_bytes", "treedb.cache.vlog_generation.vacuum.runs", "treedb.cache.vlog_generation.vacuum.failures", "treedb.cache.vlog_generation.remap.successes", diff --git a/cmd/unified_bench/profiles_treedb_index_test.go b/cmd/unified_bench/profiles_treedb_index_test.go index 80720f73c..36a897562 100644 --- a/cmd/unified_bench/profiles_treedb_index_test.go +++ b/cmd/unified_bench/profiles_treedb_index_test.go @@ -165,6 +165,7 @@ type savedTreeDBFlagState struct { vlogGenColdBytes int64 vlogRewriteBudgetBPS int64 vlogRewriteBudgetRPS int + vlogRewriteMinAgeMS int disableWAL bool relaxedSync bool disableChecksum bool @@ -197,6 +198,7 @@ func saveTreeDBFlagState() savedTreeDBFlagState { vlogGenColdBytes: *treedbVlogGenerationColdSegmentBytes, vlogRewriteBudgetBPS: *treedbVlogRewriteBudgetBytesPerSec, vlogRewriteBudgetRPS: *treedbVlogRewriteBudgetRecordsPerSec, + vlogRewriteMinAgeMS: *treedbVlogRewriteMinSegmentAgeMS, disableWAL: *treedbDisableWAL, relaxedSync: *treedbRelaxedSync, disableChecksum: *treedbDisableReadChecksum, @@ -225,6 +227,7 @@ func restoreTreeDBFlagState(s savedTreeDBFlagState) { *treedbVlogGenerationColdSegmentBytes = s.vlogGenColdBytes *treedbVlogRewriteBudgetBytesPerSec = s.vlogRewriteBudgetBPS *treedbVlogRewriteBudgetRecordsPerSec = s.vlogRewriteBudgetRPS + *treedbVlogRewriteMinSegmentAgeMS = s.vlogRewriteMinAgeMS *treedbDisableWAL = s.disableWAL *treedbRelaxedSync = s.relaxedSync *treedbDisableReadChecksum = s.disableChecksum @@ -252,6 +255,7 @@ func resetTreeDBIndexFlagsForTest() { *treedbVlogGenerationColdSegmentBytes = 0 *treedbVlogRewriteBudgetBytesPerSec = 0 *treedbVlogRewriteBudgetRecordsPerSec = 0 + *treedbVlogRewriteMinSegmentAgeMS = 0 *treedbDisableWAL = false 
*treedbRelaxedSync = false *treedbDisableReadChecksum = false diff --git a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md index 052bd806d..648120a26 100644 --- a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md +++ b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md @@ -10,6 +10,7 @@ - `-treedb-vlog-generation-policy hot_warm_cold` - `-treedb-vlog-rewrite-trigger-total-bytes` set for your dataset size - `-treedb-vlog-rewrite-budget-bytes-per-sec` and/or `-treedb-vlog-rewrite-budget-records-per-sec` +- `-treedb-vlog-rewrite-min-segment-age-ms` keep default for production; lower only for short-loop experiments ## Maintenance Model - Rewrite: threshold-triggered and budget-bounded. @@ -23,6 +24,12 @@ Primary keys: - `treedb.cache.vlog_generation.scheduler_state` - `treedb.cache.vlog_generation.scheduler_last_reason` - `treedb.cache.vlog_generation.churn_bytes_per_sec` +- `treedb.cache.vlog_generation.rewrite.min_segment_age_ms` +- `treedb.cache.vlog_generation.rewrite.plan_runs` +- `treedb.cache.vlog_generation.rewrite.plan_empty` +- `treedb.cache.vlog_generation.rewrite.plan_empty.age_blocked` +- `treedb.cache.vlog_generation.rewrite.plan_selected` +- `treedb.cache.vlog_generation.rewrite.ledger_bytes_stale` - `treedb.cache.vlog_generation.rewrite.runs` - `treedb.cache.vlog_generation.rewrite.bytes_in` - `treedb.cache.vlog_generation.rewrite.bytes_out` @@ -32,6 +39,83 @@ Primary keys: - `treedb.cache.vlog_generation.vacuum.runs` - `treedb.cache.vlog_generation.vacuum.failures` +## Live Run Capacity Report +For `run_celestia`-style runs, analyze the latest diagnostics snapshot with: + +```bash +./scripts/analyze_vlog_maintenance_capacity.py +``` + +Optional explicit input: + +```bash +./scripts/analyze_vlog_maintenance_capacity.py ~/.celestia-app-mainnet-treedb- +./scripts/analyze_vlog_maintenance_capacity.py ~/.celestia-app-mainnet-treedb-/sync/diagnostics/.debug_vars.json +``` + +The report highlights: +- maintenance lane pressure 
(attempt/acquire/collision + skip mix) +- rewrite plan-to-exec realization +- rewrite source outcomes (requested vs still-referenced vs unreferenced) +- stale-bytes processed vs immediate reclaim +- observed-source replay drain +- observed-source retained-prune outcomes (candidate/live-skipped/zombie-marked/removed) +- zombie inventory (pinned vs unpinned bytes) +- GC eligibility/protection signals + +## Interleaved A/B Harness +For sync+rewrite tradeoff validation, use the interleaved harness: + +```bash +cat >/tmp/cel_control.env <<'EOF' +LOCAL_GOMAP_DIR=/path/to/control/gomap +TREEDB_OPEN_PROFILE=fast +EOF + +cat >/tmp/cel_candidate.env <<'EOF' +LOCAL_GOMAP_DIR=/path/to/candidate/gomap +TREEDB_OPEN_PROFILE=fast +EOF + +CONTROL_ENV_FILE=/tmp/cel_control.env \ +CANDIDATE_ENV_FILE=/tmp/cel_candidate.env \ +MAX_PAIRS=10 \ +MIN_PAIRS=4 \ +CLEAR_WIN_PAIRS=3 \ +CLEAR_LOSS_PAIRS=3 \ +./scripts/run_celestia_ab.sh +``` + +Default pair metric focus: +- `T_sync`: sync duration (seconds) +- `S_sync_app`: app dir bytes at sync end +- `S_sync_wal`: `application.db/maindb/wal` bytes at sync end +- `T_rw`: offline `vlog-rewrite` wall time +- `S_post_wal`: WAL bytes after offline rewrite +- `T_total = T_sync + T_rw` +- `max_rss_kb` (memory guardrail) + +Outputs: +- `artifacts/celestia_ab//runs.csv` +- `artifacts/celestia_ab//pairs.csv` +- `artifacts/celestia_ab//summary.md` +- per-run JSON under `artifacts/celestia_ab//runs/*/run.json` + +The harness alternates run order per pair (`control->candidate`, then +`candidate->control`) and can stop early on clear win/loss signals. + +## Experimental Knob +- `TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1` + - WAL-off only. + - Allows rewrite planning/execution before the first explicit checkpoint. + - Default is disabled to avoid adding early restore contention. + - Use for controlled `run_celestia` experiments when `maintenance.skip.before_first_checkpoint` dominates and live rewrite never starts. 
+- `TREEDB_ENABLE_VLOG_GENERATION_CHECKPOINT_KICK_HOT_DEBT_ONLY=1` + - WAL-off only. + - During checkpoint-kick maintenance, skips starting a fresh rewrite plan while foreground activity is hot and rewrite queue debt is empty. + - Still allows queued rewrite debt (and deferred-due passes) to run. + - Default is disabled. + ## Bench Commands ### Churn sanity (TreeDB) ```bash diff --git a/docs/benchmarks/CELESTIA_CHECKPOINT_KICK_HOT_DEBT_ONLY_2026-03-28.md b/docs/benchmarks/CELESTIA_CHECKPOINT_KICK_HOT_DEBT_ONLY_2026-03-28.md new file mode 100644 index 000000000..2e40ec607 --- /dev/null +++ b/docs/benchmarks/CELESTIA_CHECKPOINT_KICK_HOT_DEBT_ONLY_2026-03-28.md @@ -0,0 +1,89 @@ +# Celestia: Checkpoint-Kick Hot-Debt-Only Gate (2026-03-28) + +## Goal +Reduce `run_celestia` sync wall-time regression from live value-log maintenance while preserving on-disk size gains. + +## Change Under Test +Candidate enables: + +- `TREEDB_ENABLE_VLOG_GENERATION_CHECKPOINT_KICK_HOT_DEBT_ONLY=1` + +Behavior: + +- In WAL-off checkpoint-kick path, if foreground is hot and rewrite queue is empty, skip starting a fresh rewrite plan. +- Queued rewrite debt and deferred-due maintenance still run. +- Default behavior remains unchanged unless this env flag is set. + +## Commands +Both campaigns used fixed trust/target and a single interleaved pair (`MAX_PAIRS=1`) with offline rewrite enabled. 
+ +Common env (both variants): + +- `TREEDB_OPEN_PROFILE=fast` +- `POLL_INTERVAL_SECONDS=1` +- `FREEZE_REMOTE_HEIGHT_AT_START=1` +- `ALLOW_CLAMPED_TARGET_EARLY_EXIT=1` +- `STOP_AT_LOCAL_HEIGHT=` +- `TRUST_HEIGHT=` +- `TRUST_HASH=` + +Variant-specific env: + +- `main`: `LOCAL_GOMAP_DIR=/tmp/gomap_ab_base_20260328162444` +- `hot_debt_only`: `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active` + `TREEDB_ENABLE_VLOG_GENERATION_CHECKPOINT_KICK_HOT_DEBT_ONLY=1` + +Harness: + +```bash +OUT_DIR= \ +CONTROL_ENV_FILE= \ +CANDIDATE_ENV_FILE= \ +MAX_PAIRS=1 MIN_PAIRS=1 CLEAR_WIN_PAIRS=1 CLEAR_LOSS_PAIRS=1 \ +LOW_SIGNAL_MIN_PAIRS=1 LOW_SIGNAL_NEUTRAL_STREAK=1 \ +SIZE_TOLERANCE_BYTES=$((64<<20)) TIME_TOLERANCE_SECONDS=120 \ +REWRITE_ENABLED=1 \ +./scripts/run_celestia_ab.sh +``` + +## Runs +- control=main, candidate=hot_debt_only: + - `/tmp/celestia_ab_hotdebt_20260328171204` +- control=hot_debt_only, candidate=main (swapped to counter order bias): + - `/tmp/celestia_ab_hotdebt_swapped_20260328172453` + +## Normalized Results (hot_debt_only - main) +- Run A (hot_debt_only as candidate): + - `delta_t_sync_seconds = -16` + - `delta_t_total_seconds = -17` + - `delta_s_sync_app_bytes = -694,418,294` + - `delta_s_post_wal_bytes = +3,315,722` +- Run B (hot_debt_only as control, normalized): + - `delta_t_sync_seconds = +3` + - `delta_t_total_seconds = +2` + - `delta_s_sync_app_bytes = -98,696,592` + - `delta_s_post_wal_bytes = -3,665,002` + +Two-run median/average (same with n=2): + +- `delta_t_sync_seconds = -6.5s` +- `delta_t_total_seconds = -7.5s` +- `delta_s_sync_app_bytes = -396,557,443B` (~`-378.2 MiB`) +- `delta_s_post_wal_bytes = -174,640B` (~`-170.5 KiB`, effectively neutral) + +## Maintenance Counters +Across both runs, both variants showed: + +- `rewrite_runs=0` +- `checkpoint_kick_runs=0` + +Candidate (`hot_debt_only`) showed one lightweight GC pass in each run (`gc_runs=1`), with no rewrite execution. 
+ +## Takeaway +The hot-debt-only gate removed checkpoint-kick rewrite pressure during hot sync windows and improved sync+rewrite wall time in this small sample, while keeping pre-rewrite app size better than main and post-rewrite WAL roughly neutral. + +## Next Step +Run an interleaved sequence with more pairs (stop-on-significance) and include the new stat key: + +- `treedb.cache.vlog_generation.checkpoint_kick.skipped_hot_no_debt` + +to confirm skip path activation frequency under full mainnet sync pressure. diff --git a/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md new file mode 100644 index 000000000..25ba010c0 --- /dev/null +++ b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md @@ -0,0 +1,118 @@ +# Celestia Compression Iteration Loop + +This loop exists to avoid slow, low-signal experimentation. + +Primary objective: +- Reduce on-disk `application.db` bytes. + +Secondary objectives: +- Keep combined wall time (`sync + rewrite`) bounded. +- Avoid memory regressions (`max_rss`). +- Keep gzip as a sanity check, not the primary objective. + +## Stage 0: Hypothesis Contract (Required) + +Before running anything expensive, define: +- hypothesis: what changed and why it should help +- expected effect size: minimum size delta worth promoting +- time budget: max acceptable wall-time regression +- rollback condition: what result means we stop and redesign + +If expected effect size is below threshold, do not run full `run_celestia` yet. + +## Stage 1: Fast Gate (Default Iteration Loop) + +Use `scripts/celestia_fast_gate.sh` for fast interleaved control/candidate A/B. 
+ +What it measures per run: +- pre-rewrite size: `sync_app`, `sync_wal`, optional `sync_gzip` +- post-rewrite size: `post_app`, `post_wal`, optional `post_gzip` +- timing: benchmark duration + rewrite duration + total +- throughput: batch-write ops/sec from unified-bench output + +Defaults chosen for celestia-like pressure: +- `-profile fast` +- `-val-pattern celestia_height_prefix_fill` +- dict compression enabled +- dict defaults passed explicitly: + - `-treedb-vlog-dict-train-bytes=1048576` + - `-treedb-vlog-dict-dict-bytes=32768` + +Fast-gate anti-loop safeguards: +- interleaved order alternates each pair (bias reduction) +- early clear stop (improvement/regression) +- futility stop when remaining pairs cannot reach a clear decision +- low-signal stop on neutral-streak threshold +- per-run process review artifact (`process_review.md`) + +Example: + +```bash +MAX_PAIRS=6 \ +MIN_PAIRS=3 \ +CLEAR_WIN_PAIRS=2 \ +CLEAR_LOSS_PAIRS=2 \ +LOW_SIGNAL_MIN_PAIRS=3 \ +LOW_SIGNAL_NEUTRAL_STREAK=3 \ +SIZE_FIELD=s_post_app_bytes \ +SIZE_TOLERANCE_BYTES=$((64<<20)) \ +TIME_TOLERANCE_SECONDS=30 \ +./scripts/celestia_fast_gate.sh +``` + +Outputs: +- `summary.md` +- `process_review.md` +- `runs.csv` +- `pairs.csv` +- per-run `run.json` + +## Stage 2: Pprof/Implementation Efficiency Pass + +Run this stage before full `run_celestia` if fast gate shows: +- promising size gains with time regression, or +- ambiguous neutral outcomes near threshold. + +Goal: +- remove avoidable implementation overhead (copying/alloc/lock contention) +- preserve size gains while pulling time back inside budget + +## Stage 3: Full `run_celestia` A/B Confirmation + +Only promote candidates that pass Stage 1 and Stage 2. + +Use `scripts/run_celestia_ab.sh` with interleaved pairs and stop rules. 
+ +Now includes anti-loop safeguards: +- clear stop (improvement/regression) +- futility stop (`futile_remaining_pairs`) +- low-signal neutral-streak stop (`low_signal_neutral_streak`) + +Example: + +```bash +MAX_PAIRS=4 \ +MIN_PAIRS=3 \ +CLEAR_WIN_PAIRS=2 \ +CLEAR_LOSS_PAIRS=2 \ +LOW_SIGNAL_MIN_PAIRS=3 \ +LOW_SIGNAL_NEUTRAL_STREAK=3 \ +REWRITE_ENABLED=1 \ +./scripts/run_celestia_ab.sh +``` + +## Process Review Cadence + +Review and revise the loop after every decision event: +- `clear_improvement` +- `clear_regression` +- `futile_remaining_pairs` +- `low_signal_neutral_streak` + +Required review questions: +- Was the fast gate predictive of full-run direction? +- Were thresholds too strict or too loose for current goals? +- Did we spend time validating changes below meaningful effect size? +- Is the next candidate large enough to justify promotion? + +If two consecutive campaigns end in low-signal/futility, tighten promotion gates and bundle larger candidate deltas before next full run. diff --git a/docs/benchmarks/VLOG_REWRITE_MIN_SEGMENT_AGE_SWEEP_2026-03-28.md b/docs/benchmarks/VLOG_REWRITE_MIN_SEGMENT_AGE_SWEEP_2026-03-28.md new file mode 100644 index 000000000..6f3608eb7 --- /dev/null +++ b/docs/benchmarks/VLOG_REWRITE_MIN_SEGMENT_AGE_SWEEP_2026-03-28.md @@ -0,0 +1,72 @@ +# VLOG Rewrite Min-Segment-Age Sweep (2026-03-28) + +## Goal + +Evaluate whether lowering online rewrite min-segment-age improves short-loop +signal without harming sync-time or end-of-run app-dir size. 
+ +## Workload + +- Command core: + - `./bin/unified-bench` + - `-profile fast` + - `-dbs treedb` + - `-keys 900000` + - `-valsize 256` + - `-batchsize 4000` + - `-test batch_write_steady,random_write` + - `-val-pattern celestia_height_prefix_fill` + - `-checkpoint-every-bytes 4194304` + - `-treedb-force-value-pointers=true` + - `-treedb-vlog-compression dict` + - `-treedb-vlog-compression-autotune aggressive` + - `-treedb-vlog-generation-policy hot_warm_cold` + - `-treedb-vlog-rewrite-trigger-total-bytes 1` + - `-treedb-vlog-rewrite-trigger-stale-ratio-ppm 1` + - `-treedb-vlog-rewrite-trigger-churn-per-sec 1` + - `-treedb-vlog-rewrite-budget-bytes-per-sec 134217728` + - `-treedb-cache-stats-after-tests=true` + +- Swept: + - default (effective 30000ms) + - `-treedb-vlog-rewrite-min-segment-age-ms 1000` + - `-treedb-vlog-rewrite-min-segment-age-ms 5000` + - `-treedb-vlog-rewrite-min-segment-age-ms 10000` + +## Results + +| min age | rewrite activity | dir bytes | wal bytes | note | +|---|---:|---:|---:|---| +| default (30000ms) | rewrite_runs=0, plan_empty.age_blocked=1 | 567,439,668 | 553,889,306 | baseline behavior | +| 1000ms | rewrite_runs=1, plan_selected=1, gc_runs=1 | 702,734,421 | 685,611,243 | clear regression | +| 5000ms | rewrite_runs=0, plan_empty.age_blocked=1 | 567,406,884 | 553,889,290 | effectively baseline | +| 10000ms | rewrite_runs=0, plan_empty.age_blocked=1 | 567,439,650 | 553,889,288 | effectively baseline | + +Observed for the regressing 1000ms run: + +- `rewrite.bytes_in` ~= 64MB +- `rewrite.bytes_out` ~= 528MB +- `rewrite.reclaim_ratio` = `0.000000` +- `gc.deleted_segments` = `0` + +Interpretation: rewrite executes too early and amplifies bytes without reclaim, +so this setting is not suitable for production-like loops. 
+ +## Interleaved A/B confirmation + +Using `scripts/celestia_fast_gate.sh` with same binaries and only this flag as +candidate delta (`CANDIDATE_EXTRA_FLAGS='-treedb-vlog-rewrite-min-segment-age-ms 1'`): + +- Output: `/tmp/gomap_minage_gate_ctr4Ji/gate` +- Decision: `clear_regression` +- Completed pairs: 2 +- Median delta (`candidate - control`): + - `s_sync_app_bytes`: +135,580,501.5 + - `t_sync_seconds`: +13 + +## Conclusion + +- Keep default min-segment-age for normal runs. +- Keep the flag as an explicit lab-only override for controlled scheduler + experiments. +- Do not enable low values (1ms/1000ms) in gate/default configs. diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py new file mode 100755 index 000000000..b5b6eabc0 --- /dev/null +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -0,0 +1,790 @@ +#!/usr/bin/env python3 +"""Summarize live TreeDB vlog maintenance capacity from run_celestia diagnostics. + +Input can be: +- a run home dir (e.g. ~/.celestia-app-mainnet-treedb-YYYY...) +- a diagnostics dir +- a debug vars JSON file + +By default, the script scans the newest ~/.celestia-app-mainnet-treedb-* home. 
+""" + +from __future__ import annotations + +import argparse +import glob +import json +import math +import os +import sys +from pathlib import Path +from typing import Any + + +def human_bytes(value: float) -> str: + if value is None or math.isnan(value): + return "n/a" + n = float(value) + if n < 0: + return f"-{human_bytes(-n)}" + units = ["B", "KiB", "MiB", "GiB", "TiB"] + idx = 0 + while n >= 1024.0 and idx < len(units) - 1: + n /= 1024.0 + idx += 1 + if idx == 0: + return f"{int(n)} {units[idx]}" + return f"{n:.2f} {units[idx]}" + + +def pct(num: float, den: float) -> float: + if den <= 0: + return 0.0 + return 100.0 * num / den + + +def safe_int(value: Any, default: int = 0) -> int: + if isinstance(value, bool): + return int(value) + if isinstance(value, int): + return value + if isinstance(value, float): + return int(value) + if isinstance(value, str): + s = value.strip().lower() + if not s: + return default + if s == "true": + return 1 + if s == "false": + return 0 + try: + return int(s) + except ValueError: + try: + return int(float(s)) + except ValueError: + return default + return default + + +def safe_float(value: Any, default: float = 0.0) -> float: + if isinstance(value, bool): + return float(int(value)) + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + s = value.strip().lower() + if not s: + return default + if s == "true": + return 1.0 + if s == "false": + return 0.0 + try: + return float(s) + except ValueError: + return default + return default + + +def pick_latest(paths: list[Path]) -> Path | None: + if not paths: + return None + return max(paths, key=lambda p: p.stat().st_mtime) + + +def find_latest_home() -> Path | None: + homes: list[Path] = [] + for raw in glob.glob(os.path.expanduser("~/.celestia-app-mainnet-treedb-*")): + p = Path(raw) + if p.is_dir(): + homes.append(p) + return pick_latest(homes) + + +def find_diagnostics_file(root: Path) -> Path | None: + roots: list[Path] = [] + if (root / "sync" 
/ "diagnostics").is_dir(): + roots.append(root / "sync" / "diagnostics") + if (root / "diagnostics").is_dir(): + roots.append(root / "diagnostics") + if root.is_dir() and root.name == "diagnostics": + roots.append(root) + + patterns = ["*.debug_vars.json", "*.treedb_vars.json", "*.treedb_application_vars.json"] + + # Prefer richer payload shapes in order. Ignore obviously empty snapshots. + for pat in patterns: + candidates: list[Path] = [] + for diag in roots: + candidates.extend(diag.glob(pat)) + # If caller passed a file-like path prefix directory with JSON files only. + if root.is_dir() and not roots: + candidates.extend(root.glob(pat)) + candidates.sort(key=lambda p: p.stat().st_mtime, reverse=True) + for cand in candidates: + # "{}\n" snapshots are not useful for maintenance analysis. + if cand.stat().st_size <= 4: + continue + return cand + + # Fallback: if all snapshots are tiny/empty, still return the newest one. + fallback: list[Path] = [] + for pat in patterns: + for diag in roots: + fallback.extend(diag.glob(pat)) + if root.is_dir() and not roots: + fallback.extend(root.glob(pat)) + return pick_latest(fallback) + + +def find_home_from_path(path: Path) -> str: + for parent in [path] + list(path.parents): + name = parent.name + if name.startswith(".celestia-app-mainnet-"): + return str(parent) + return "" + + +def choose_instance(instances: dict[str, Any], pattern: str) -> tuple[str, dict[str, Any]]: + if not instances: + return "", {} + + if pattern: + matches = [(k, v) for k, v in instances.items() if pattern in k and isinstance(v, dict)] + if matches: + # Prefer the richest stats object among matches. 
+            matches.sort(key=lambda kv: len(kv[1]), reverse=True)
+            return matches[0][0], matches[0][1]
+
+    # No pattern match: score instances by how many vlog_generation metrics they
+    # expose, then by total stat count, and take the highest scorer.
+    scored: list[tuple[int, int, str, dict[str, Any]]] = []
+    for k, v in instances.items():
+        if not isinstance(v, dict):
+            continue
+        vg_count = sum(1 for key in v.keys() if str(key).startswith("treedb.cache.vlog_generation."))
+        scored.append((vg_count, len(v), k, v))
+    if scored:
+        scored.sort(reverse=True)
+        _, _, k, v = scored[0]
+        return k, v
+
+    # Last resort: deterministic pick of the lexicographically first instance.
+    first_key = sorted(instances.keys())[0]
+    val = instances[first_key]
+    if isinstance(val, dict):
+        return first_key, val
+    return first_key, {}
+
+
+def extract_stats(payload: Any, instance_pattern: str) -> tuple[dict[str, Any], str]:
+    """Extract a (stats map, instance name) pair from a debug-vars payload.
+
+    Handles several snapshot shapes; returns ({}, "") when nothing matches.
+    """
+    if not isinstance(payload, dict):
+        return {}, ""
+
+    # Most complete shape from debug vars snapshots:
+    # { "treedb": { "instances": { "...": { stats... } } } }
+    treedb = payload.get("treedb")
+    if isinstance(treedb, dict):
+        instances = treedb.get("instances")
+        if isinstance(instances, dict):
+            instance_name, stats = choose_instance(instances, instance_pattern)
+            return stats, instance_name
+
+    # Flat stats map shape.
+    if any(str(k).startswith("treedb.cache.") for k in payload.keys()):
+        return payload, ""
+
+    # Other possible shape: top-level instances.
+ instances = payload.get("instances") + if isinstance(instances, dict): + instance_name, stats = choose_instance(instances, instance_pattern) + return stats, instance_name + + return {}, "" + + +def metric_int(stats: dict[str, Any], key: str) -> int: + return safe_int(stats.get(key, 0), 0) + + +def metric_float(stats: dict[str, Any], key: str) -> float: + return safe_float(stats.get(key, 0.0), 0.0) + + +def build_summary(stats: dict[str, Any]) -> dict[str, Any]: + m = { + "maintenance_attempts": metric_int(stats, "treedb.cache.vlog_generation.maintenance.attempts"), + "maintenance_acquired": metric_int(stats, "treedb.cache.vlog_generation.maintenance.acquired"), + "maintenance_collisions": metric_int(stats, "treedb.cache.vlog_generation.maintenance.collisions"), + "maintenance_noop": metric_int(stats, "treedb.cache.vlog_generation.maintenance.passes.noop"), + "maintenance_with_rewrite": metric_int(stats, "treedb.cache.vlog_generation.maintenance.passes.with_rewrite"), + "maintenance_with_gc": metric_int(stats, "treedb.cache.vlog_generation.maintenance.passes.with_gc"), + "rewrite_runs": metric_int(stats, "treedb.cache.vlog_generation.rewrite.runs"), + "rewrite_plan_runs": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_runs"), + "rewrite_plan_selected": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected"), + "rewrite_plan_empty": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_empty"), + "rewrite_plan_empty_no_selection": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_empty.no_selection"), + "rewrite_plan_empty_age_blocked": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_empty.age_blocked"), + "rewrite_plan_selected_segments_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected_segments_total"), + "rewrite_plan_penalty_filter_runs": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_penalty_filter.runs"), + "rewrite_plan_penalty_filter_segments": 
metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_penalty_filter.segments"), + "rewrite_plan_penalty_filter_to_empty_runs": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_penalty_filter.to_empty_runs"), + "rewrite_exec_source_segments_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_total"), + "rewrite_exec_source_segments_requested_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"), + "rewrite_exec_source_segments_still_referenced_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"), + "rewrite_exec_source_segments_unreferenced_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_total"), + "rewrite_exec_source_segments_requested_last": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_last"), + "rewrite_exec_source_segments_still_referenced_last": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_last"), + "rewrite_exec_source_segments_unreferenced_last": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_last"), + "rewrite_plan_selected_bytes_stale": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale"), + "rewrite_processed_stale_bytes": metric_int(stats, "treedb.cache.vlog_generation.rewrite.processed_stale_bytes"), + "rewrite_processed_live_bytes": metric_int(stats, "treedb.cache.vlog_generation.rewrite.processed_live_bytes"), + "rewrite_bytes_in": metric_int(stats, "treedb.cache.vlog_generation.rewrite.bytes_in"), + "rewrite_bytes_out": metric_int(stats, "treedb.cache.vlog_generation.rewrite.bytes_out"), + "rewrite_reclaimed_bytes": metric_int(stats, "treedb.cache.vlog_generation.rewrite.reclaimed_bytes"), + "rewrite_no_reclaim_runs": metric_int(stats, 
"treedb.cache.vlog_generation.rewrite.no_reclaim_runs"), + "rewrite_exec_total_ms": metric_float(stats, "treedb.cache.vlog_generation.rewrite.exec.total_ms"), + "rewrite_exec_avg_ms": metric_float(stats, "treedb.cache.vlog_generation.rewrite.exec.avg_ms"), + "rewrite_ledger_bytes_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.ledger_bytes_total"), + "rewrite_ledger_bytes_stale": metric_int(stats, "treedb.cache.vlog_generation.rewrite.ledger_bytes_stale"), + "rewrite_ledger_segments": metric_int(stats, "treedb.cache.vlog_generation.rewrite.ledger_segments"), + "rewrite_age_blocked_remaining_ms": metric_int(stats, "treedb.cache.vlog_generation.rewrite.age_blocked_remaining_ms"), + "rewrite_penalties_active": metric_int(stats, "treedb.cache.vlog_generation.rewrite.penalties_active"), + "rewrite_budget_consumed_bytes_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total"), + "rewrite_budget_tokens_utilization_pct": metric_float(stats, "treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct"), + "gc_runs": metric_int(stats, "treedb.cache.vlog_generation.gc.runs"), + "gc_deleted_bytes": metric_int(stats, "treedb.cache.vlog_generation.gc.deleted_bytes"), + "gc_deleted_segments": metric_int(stats, "treedb.cache.vlog_generation.gc.deleted_segments"), + "gc_exec_total_ms": metric_float(stats, "treedb.cache.vlog_generation.gc.exec.total_ms"), + "gc_exec_avg_ms": metric_float(stats, "treedb.cache.vlog_generation.gc.exec.avg_ms"), + "gc_last_eligible_bytes": metric_int(stats, "treedb.cache.vlog_generation.gc.last_eligible_bytes"), + "gc_last_pending_bytes": metric_int(stats, "treedb.cache.vlog_generation.gc.last_pending_bytes"), + "gc_last_protected_retained_bytes": metric_int(stats, "treedb.cache.vlog_generation.gc.last_protected_retained_bytes"), + "retained_prune_closed_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.closed_bytes"), + "retained_prune_runs": metric_int(stats, 
"treedb.cache.vlog_retained_prune.runs"), + "retained_prune_forced_runs": metric_int(stats, "treedb.cache.vlog_retained_prune.forced_runs"), + "retained_prune_candidate_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.candidate_segments"), + "retained_prune_candidate_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.candidate_bytes"), + "retained_prune_removed_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.removed_segments"), + "retained_prune_removed_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.removed_bytes"), + "retained_prune_in_use_skipped_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.in_use_skipped_segments"), + "retained_prune_in_use_skipped_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.in_use_skipped_bytes"), + "retained_prune_live_skipped_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.live_skipped_segments"), + "retained_prune_live_skipped_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.live_skipped_bytes"), + "retained_prune_zombie_marked_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.zombie_marked_segments"), + "retained_prune_zombie_marked_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.zombie_marked_bytes"), + "vlog_zombie_segments": metric_int(stats, "treedb.cache.vlog_zombie.segments"), + "vlog_zombie_bytes": metric_int(stats, "treedb.cache.vlog_zombie.bytes"), + "vlog_zombie_pinned_segments": metric_int(stats, "treedb.cache.vlog_zombie.pinned_segments"), + "vlog_zombie_pinned_bytes": metric_int(stats, "treedb.cache.vlog_zombie.pinned_bytes"), + "vlog_zombie_unpinned_segments": metric_int(stats, "treedb.cache.vlog_zombie.unpinned_segments"), + "vlog_zombie_unpinned_bytes": metric_int(stats, "treedb.cache.vlog_zombie.unpinned_bytes"), + "retained_prune_observed_source_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_total"), + 
"retained_prune_observed_source_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_total"), + "retained_prune_observed_source_candidate_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_candidate_total"), + "retained_prune_observed_source_candidate_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_candidate_total"), + "retained_prune_observed_source_removed_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_removed_total"), + "retained_prune_observed_source_removed_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_removed_total"), + "retained_prune_observed_source_in_use_skipped_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_in_use_skipped_total"), + "retained_prune_observed_source_in_use_skipped_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_in_use_skipped_total"), + "retained_prune_observed_source_live_skipped_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_live_skipped_total"), + "retained_prune_observed_source_live_skipped_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_live_skipped_total"), + "retained_prune_observed_source_parse_skipped_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_parse_skipped_total"), + "retained_prune_observed_source_parse_skipped_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_parse_skipped_total"), + "retained_prune_observed_source_zombie_marked_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_zombie_marked_total"), + "retained_prune_observed_source_zombie_marked_bytes_total": metric_int(stats, 
"treedb.cache.vlog_retained_prune.observed_source.bytes_zombie_marked_total"), + "observed_gc_pending_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.pending_ids"), + "observed_gc_queued_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.queued_ids"), + "observed_gc_taken_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.taken_ids"), + "observed_gc_runs": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.runs"), + "observed_gc_retry_queued": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.retry_queued"), + "observed_gc_retry_dropped": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.retry_dropped"), + "observed_gc_retry_max_attempts": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.retry_max_attempts"), + "observed_gc_latency_completed_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.latency.completed_ids"), + "observed_gc_latency_dropped_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.latency.dropped_ids"), + "observed_gc_latency_total_ms": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.latency.total_ms"), + "observed_gc_latency_max_ms": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.latency.max_ms"), + "observed_gc_source_segments_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_total"), + "observed_gc_source_segments_eligible_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"), + "observed_gc_source_segments_deleted_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"), + "observed_gc_source_segments_protected_in_use_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_protected_in_use_total"), + "observed_gc_source_segments_protected_retained_total": metric_int(stats, 
"treedb.cache.vlog_generation.observed_gc.source_segments_protected_retained_total"), + "observed_gc_source_segments_protected_overlap_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_protected_overlap_total"), + "observed_gc_source_segments_protected_other_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_protected_other_total"), + "observed_gc_source_bytes_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_total"), + "observed_gc_source_bytes_eligible_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_eligible_total"), + "observed_gc_source_bytes_deleted_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total"), + "observed_gc_source_bytes_protected_in_use_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_protected_in_use_total"), + "observed_gc_source_bytes_protected_retained_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total"), + "observed_gc_source_bytes_protected_overlap_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_protected_overlap_total"), + "observed_gc_source_bytes_protected_other_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_protected_other_total"), + "checkpoint_kick_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.runs"), + "checkpoint_kick_gc_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.gc_runs"), + "checkpoint_kick_rewrite_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"), + "checkpoint_kick_skipped_hot_no_debt": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.skipped_hot_no_debt"), + } + + skip_keys = [ + "treedb.cache.vlog_generation.maintenance.skip.wal_on_periodic", + 
"treedb.cache.vlog_generation.maintenance.skip.maintenance_phase", + "treedb.cache.vlog_generation.maintenance.skip.stage_gate", + "treedb.cache.vlog_generation.maintenance.skip.stage_gate_not_due", + "treedb.cache.vlog_generation.maintenance.skip.stage_gate_due_reserved", + "treedb.cache.vlog_generation.maintenance.skip.age_blocked_gate", + "treedb.cache.vlog_generation.maintenance.skip.priority_pending", + "treedb.cache.vlog_generation.maintenance.skip.quiet_window", + "treedb.cache.vlog_generation.maintenance.skip.before_first_checkpoint", + "treedb.cache.vlog_generation.maintenance.skip.checkpoint_inflight", + ] + skip_map = {k.split(".")[-1]: metric_int(stats, k) for k in skip_keys} + m["maintenance_skip"] = skip_map + m["maintenance_skip_total"] = sum(skip_map.values()) + + passes_total = m["maintenance_noop"] + m["maintenance_with_rewrite"] + m["maintenance_with_gc"] + m["maintenance_passes_total"] = passes_total + m["maintenance_acquire_rate_pct"] = pct(m["maintenance_acquired"], m["maintenance_attempts"]) + m["maintenance_collision_rate_pct"] = pct(m["maintenance_collisions"], m["maintenance_attempts"]) + m["maintenance_rewrite_pass_share_pct"] = pct(m["maintenance_with_rewrite"], passes_total) + m["maintenance_gc_pass_share_pct"] = pct(m["maintenance_with_gc"], passes_total) + + m["rewrite_plan_select_rate_pct"] = pct(m["rewrite_plan_selected"], m["rewrite_plan_runs"]) + m["rewrite_segment_realization_pct"] = pct( + m["rewrite_exec_source_segments_total"], + m["rewrite_plan_selected_segments_total"], + ) + m["rewrite_source_unreferenced_pct"] = pct( + m["rewrite_exec_source_segments_unreferenced_total"], + m["rewrite_exec_source_segments_requested_total"], + ) + m["rewrite_source_still_referenced_pct"] = pct( + m["rewrite_exec_source_segments_still_referenced_total"], + m["rewrite_exec_source_segments_requested_total"], + ) + m["rewrite_stale_selection_coverage_pct"] = pct( + m["rewrite_processed_stale_bytes"], + m["rewrite_plan_selected_bytes_stale"], + 
) + m["rewrite_immediate_reclaim_pct"] = pct( + m["rewrite_reclaimed_bytes"], + m["rewrite_processed_stale_bytes"], + ) + m["rewrite_stale_not_reclaimed_bytes"] = max( + 0, + m["rewrite_processed_stale_bytes"] - m["rewrite_reclaimed_bytes"], + ) + rewrite_secs = m["rewrite_exec_total_ms"] / 1000.0 + m["rewrite_exec_throughput_bytes_per_sec"] = ( + (m["rewrite_bytes_in"] / rewrite_secs) if rewrite_secs > 0 else 0.0 + ) + + gc_secs = m["gc_exec_total_ms"] / 1000.0 + m["gc_delete_throughput_bytes_per_sec"] = ( + (m["gc_deleted_bytes"] / gc_secs) if gc_secs > 0 else 0.0 + ) + + m["observed_gc_drain_pct"] = pct(m["observed_gc_taken_ids"], m["observed_gc_queued_ids"]) + m["observed_gc_latency_finalized_ids"] = m["observed_gc_latency_completed_ids"] + m["observed_gc_latency_dropped_ids"] + m["observed_gc_latency_avg_ms"] = ( + (float(m["observed_gc_latency_total_ms"]) / float(m["observed_gc_latency_finalized_ids"])) + if m["observed_gc_latency_finalized_ids"] > 0 + else 0.0 + ) + m["observed_gc_source_segments_eligible_pct"] = pct( + m["observed_gc_source_segments_eligible_total"], + m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_segments_deleted_pct"] = pct( + m["observed_gc_source_segments_deleted_total"], + m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_bytes_eligible_pct"] = pct( + m["observed_gc_source_bytes_eligible_total"], + m["observed_gc_source_bytes_total"], + ) + m["observed_gc_source_bytes_deleted_pct"] = pct( + m["observed_gc_source_bytes_deleted_total"], + m["observed_gc_source_bytes_total"], + ) + m["observed_gc_source_bytes_deleted_of_eligible_pct"] = pct( + m["observed_gc_source_bytes_deleted_total"], + m["observed_gc_source_bytes_eligible_total"], + ) + m["observed_gc_source_segments_protected_in_use_pct"] = pct( + m["observed_gc_source_segments_protected_in_use_total"], + m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_segments_protected_retained_pct"] = pct( + 
m["observed_gc_source_segments_protected_retained_total"], + m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_segments_protected_overlap_pct"] = pct( + m["observed_gc_source_segments_protected_overlap_total"], + m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_segments_protected_other_pct"] = pct( + m["observed_gc_source_segments_protected_other_total"], + m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_bytes_protected_in_use_pct"] = pct( + m["observed_gc_source_bytes_protected_in_use_total"], + m["observed_gc_source_bytes_total"], + ) + m["observed_gc_source_bytes_protected_retained_pct"] = pct( + m["observed_gc_source_bytes_protected_retained_total"], + m["observed_gc_source_bytes_total"], + ) + m["observed_gc_source_bytes_protected_overlap_pct"] = pct( + m["observed_gc_source_bytes_protected_overlap_total"], + m["observed_gc_source_bytes_total"], + ) + m["observed_gc_source_bytes_protected_other_pct"] = pct( + m["observed_gc_source_bytes_protected_other_total"], + m["observed_gc_source_bytes_total"], + ) + m["retained_prune_removed_candidate_segments_pct"] = pct( + m["retained_prune_removed_segments"], + m["retained_prune_candidate_segments"], + ) + m["retained_prune_removed_candidate_bytes_pct"] = pct( + m["retained_prune_removed_bytes"], + m["retained_prune_candidate_bytes"], + ) + m["retained_prune_observed_removed_candidate_segments_pct"] = pct( + m["retained_prune_observed_source_removed_segments_total"], + m["retained_prune_observed_source_candidate_segments_total"], + ) + m["retained_prune_observed_removed_candidate_bytes_pct"] = pct( + m["retained_prune_observed_source_removed_bytes_total"], + m["retained_prune_observed_source_candidate_bytes_total"], + ) + m["retained_prune_observed_live_skipped_candidate_segments_pct"] = pct( + m["retained_prune_observed_source_live_skipped_segments_total"], + m["retained_prune_observed_source_candidate_segments_total"], + ) + 
m["retained_prune_observed_live_skipped_candidate_bytes_pct"] = pct( + m["retained_prune_observed_source_live_skipped_bytes_total"], + m["retained_prune_observed_source_candidate_bytes_total"], + ) + m["vlog_zombie_pinned_bytes_pct"] = pct( + m["vlog_zombie_pinned_bytes"], + m["vlog_zombie_bytes"], + ) + + return m + + +def print_report(summary: dict[str, Any], source_file: Path, run_home: str, instance_name: str) -> None: + print(f"Source file: {source_file}") + if run_home: + print(f"Run home: {run_home}") + if instance_name: + print(f"Instance: {instance_name}") + print("") + + print("Maintenance lane") + print( + " attempts/acquired/collisions: " + f"{summary['maintenance_attempts']} / {summary['maintenance_acquired']} / {summary['maintenance_collisions']} " + f"(acquire={summary['maintenance_acquire_rate_pct']:.1f}%, collision={summary['maintenance_collision_rate_pct']:.1f}%)" + ) + print( + " passes: " + f"total={summary['maintenance_passes_total']} " + f"noop={summary['maintenance_noop']} " + f"rewrite={summary['maintenance_with_rewrite']} " + f"gc={summary['maintenance_with_gc']} " + f"(rewrite_share={summary['maintenance_rewrite_pass_share_pct']:.1f}%, gc_share={summary['maintenance_gc_pass_share_pct']:.1f}%)" + ) + skips = summary["maintenance_skip"] + print( + " skip pressure: " + f"total={summary['maintenance_skip_total']} " + f"pre_checkpoint={skips['before_first_checkpoint']} " + f"stage_gate={skips['stage_gate']} " + f"stage_not_due={skips['stage_gate_not_due']} " + f"age_blocked={skips['age_blocked_gate']} " + f"quiet={skips['quiet_window']} " + f"priority={skips['priority_pending']} " + f"checkpoint={skips['checkpoint_inflight']}" + ) + print( + " checkpoint-kick: " + f"runs={summary['checkpoint_kick_runs']} " + f"rewrite_runs={summary['checkpoint_kick_rewrite_runs']} " + f"gc_runs={summary['checkpoint_kick_gc_runs']} " + f"skipped_hot_no_debt={summary['checkpoint_kick_skipped_hot_no_debt']}" + ) + print("") + + print("Rewrite economics") + print( 
+ " plan runs/selected/empty: " + f"{summary['rewrite_plan_runs']} / {summary['rewrite_plan_selected']} / {summary['rewrite_plan_empty']} " + f"(select_rate={summary['rewrite_plan_select_rate_pct']:.1f}%)" + ) + print( + " plan-empty breakdown: " + f"no_selection={summary['rewrite_plan_empty_no_selection']} " + f"age_blocked={summary['rewrite_plan_empty_age_blocked']}" + ) + print( + " plan penalty-filter: " + f"runs={summary['rewrite_plan_penalty_filter_runs']} " + f"segments={summary['rewrite_plan_penalty_filter_segments']} " + f"to_empty_runs={summary['rewrite_plan_penalty_filter_to_empty_runs']}" + ) + print( + " selected->executed segments: " + f"{summary['rewrite_plan_selected_segments_total']} -> {summary['rewrite_exec_source_segments_total']} " + f"(realization={summary['rewrite_segment_realization_pct']:.1f}%)" + ) + print( + " source outcomes (exec): " + f"requested_total={summary['rewrite_exec_source_segments_requested_total']} " + f"unreferenced_total={summary['rewrite_exec_source_segments_unreferenced_total']} " + f"still_referenced_total={summary['rewrite_exec_source_segments_still_referenced_total']} " + f"(unref_pct={summary['rewrite_source_unreferenced_pct']:.1f}%, still_ref_pct={summary['rewrite_source_still_referenced_pct']:.1f}%) " + f"last=requested:{summary['rewrite_exec_source_segments_requested_last']} " + f"unref:{summary['rewrite_exec_source_segments_unreferenced_last']} " + f"still_ref:{summary['rewrite_exec_source_segments_still_referenced_last']}" + ) + print( + " selected stale vs processed stale: " + f"{human_bytes(summary['rewrite_plan_selected_bytes_stale'])} -> {human_bytes(summary['rewrite_processed_stale_bytes'])} " + f"(coverage={summary['rewrite_stale_selection_coverage_pct']:.1f}%)" + ) + print( + " bytes in/out/reclaimed: " + f"{human_bytes(summary['rewrite_bytes_in'])} / {human_bytes(summary['rewrite_bytes_out'])} / {human_bytes(summary['rewrite_reclaimed_bytes'])}" + ) + print( + " stale processed w/o immediate reclaim: " + 
f"{human_bytes(summary['rewrite_stale_not_reclaimed_bytes'])} " + f"(immediate_reclaim={summary['rewrite_immediate_reclaim_pct']:.2f}%, no_reclaim_runs={summary['rewrite_no_reclaim_runs']})" + ) + print( + " exec: " + f"runs={summary['rewrite_runs']} total_ms={summary['rewrite_exec_total_ms']:.3f} avg_ms={summary['rewrite_exec_avg_ms']:.3f} " + f"throughput={human_bytes(summary['rewrite_exec_throughput_bytes_per_sec'])}/s" + ) + print( + " debt/budget: " + f"ledger={human_bytes(summary['rewrite_ledger_bytes_total'])} (stale={human_bytes(summary['rewrite_ledger_bytes_stale'])}, segs={summary['rewrite_ledger_segments']}) " + f"age_blocked_ms={summary['rewrite_age_blocked_remaining_ms']} penalties={summary['rewrite_penalties_active']} " + f"budget_consumed={human_bytes(summary['rewrite_budget_consumed_bytes_total'])} " + f"budget_util={summary['rewrite_budget_tokens_utilization_pct']:.1f}%" + ) + print("") + + print("GC economics") + print( + " runs/deleted: " + f"{summary['gc_runs']} / {summary['gc_deleted_segments']} segments, {human_bytes(summary['gc_deleted_bytes'])}" + ) + print( + " exec: " + f"total_ms={summary['gc_exec_total_ms']:.3f} avg_ms={summary['gc_exec_avg_ms']:.3f} " + f"delete_throughput={human_bytes(summary['gc_delete_throughput_bytes_per_sec'])}/s" + ) + print( + " last eligibility/protection: " + f"eligible={human_bytes(summary['gc_last_eligible_bytes'])} " + f"pending={human_bytes(summary['gc_last_pending_bytes'])} " + f"protected_retained={human_bytes(summary['gc_last_protected_retained_bytes'])}" + ) + print( + " checkpoint-kick: " + f"runs={summary['checkpoint_kick_runs']} rewrite_runs={summary['checkpoint_kick_rewrite_runs']} gc_runs={summary['checkpoint_kick_gc_runs']}" + ) + print( + " retained-prune: " + f"runs={summary['retained_prune_runs']} forced={summary['retained_prune_forced_runs']} closed={human_bytes(summary['retained_prune_closed_bytes'])} " + f"candidates={summary['retained_prune_candidate_segments']} 
({human_bytes(summary['retained_prune_candidate_bytes'])}) " + f"removed={summary['retained_prune_removed_segments']} ({human_bytes(summary['retained_prune_removed_bytes'])}) " + f"(seg_removed_pct={summary['retained_prune_removed_candidate_segments_pct']:.1f}%, bytes_removed_pct={summary['retained_prune_removed_candidate_bytes_pct']:.1f}%)" + ) + print( + " retained-prune skips: " + f"in_use={summary['retained_prune_in_use_skipped_segments']} ({human_bytes(summary['retained_prune_in_use_skipped_bytes'])}) " + f"live={summary['retained_prune_live_skipped_segments']} ({human_bytes(summary['retained_prune_live_skipped_bytes'])}) " + f"zombie_marked={summary['retained_prune_zombie_marked_segments']} ({human_bytes(summary['retained_prune_zombie_marked_bytes'])})" + ) + print( + " zombie inventory: " + f"total={summary['vlog_zombie_segments']} ({human_bytes(summary['vlog_zombie_bytes'])}) " + f"pinned={summary['vlog_zombie_pinned_segments']} ({human_bytes(summary['vlog_zombie_pinned_bytes'])}) " + f"unpinned={summary['vlog_zombie_unpinned_segments']} ({human_bytes(summary['vlog_zombie_unpinned_bytes'])}) " + f"(pinned_bytes_pct={summary['vlog_zombie_pinned_bytes_pct']:.1f}%)" + ) + print("") + + print("Observed-source replay") + print( + " queued/taken/pending ids: " + f"{summary['observed_gc_queued_ids']} / {summary['observed_gc_taken_ids']} / {summary['observed_gc_pending_ids']} " + f"(drain={summary['observed_gc_drain_pct']:.1f}%, retries={summary['observed_gc_retry_queued']}, runs={summary['observed_gc_runs']})" + ) + print( + " retry budget/latency: " + f"max_attempts={summary['observed_gc_retry_max_attempts']} " + f"retry_dropped={summary['observed_gc_retry_dropped']} " + f"finalized_ids={summary['observed_gc_latency_finalized_ids']} " + f"(completed={summary['observed_gc_latency_completed_ids']}, dropped={summary['observed_gc_latency_dropped_ids']}) " + f"latency total_ms={summary['observed_gc_latency_total_ms']} " + 
f"avg_ms={summary['observed_gc_latency_avg_ms']:.3f} " + f"max_ms={summary['observed_gc_latency_max_ms']}" + ) + print( + " observed-source totals: " + f"segments total={summary['observed_gc_source_segments_total']} " + f"eligible={summary['observed_gc_source_segments_eligible_total']} " + f"deleted={summary['observed_gc_source_segments_deleted_total']} " + f"(eligible_pct={summary['observed_gc_source_segments_eligible_pct']:.1f}%, deleted_pct={summary['observed_gc_source_segments_deleted_pct']:.1f}%)" + ) + print( + " observed-source bytes: " + f"total={human_bytes(summary['observed_gc_source_bytes_total'])} " + f"eligible={human_bytes(summary['observed_gc_source_bytes_eligible_total'])} " + f"deleted={human_bytes(summary['observed_gc_source_bytes_deleted_total'])} " + f"protected_retained={human_bytes(summary['observed_gc_source_bytes_protected_retained_total'])} " + f"(eligible_pct={summary['observed_gc_source_bytes_eligible_pct']:.1f}%, " + f"deleted_pct={summary['observed_gc_source_bytes_deleted_pct']:.1f}%, " + f"deleted_of_eligible={summary['observed_gc_source_bytes_deleted_of_eligible_pct']:.1f}%)" + ) + print( + " observed-source protection mix: " + f"segments in_use={summary['observed_gc_source_segments_protected_in_use_total']} " + f"retained={summary['observed_gc_source_segments_protected_retained_total']} " + f"overlap={summary['observed_gc_source_segments_protected_overlap_total']} " + f"other={summary['observed_gc_source_segments_protected_other_total']} " + f"(in_use={summary['observed_gc_source_segments_protected_in_use_pct']:.1f}%, " + f"retained={summary['observed_gc_source_segments_protected_retained_pct']:.1f}%, " + f"overlap={summary['observed_gc_source_segments_protected_overlap_pct']:.1f}%, " + f"other={summary['observed_gc_source_segments_protected_other_pct']:.1f}%) " + f"bytes in_use={human_bytes(summary['observed_gc_source_bytes_protected_in_use_total'])} " + 
f"retained={human_bytes(summary['observed_gc_source_bytes_protected_retained_total'])} " + f"overlap={human_bytes(summary['observed_gc_source_bytes_protected_overlap_total'])} " + f"other={human_bytes(summary['observed_gc_source_bytes_protected_other_total'])} " + f"(in_use={summary['observed_gc_source_bytes_protected_in_use_pct']:.1f}%, " + f"retained={summary['observed_gc_source_bytes_protected_retained_pct']:.1f}%, " + f"overlap={summary['observed_gc_source_bytes_protected_overlap_pct']:.1f}%, " + f"other={summary['observed_gc_source_bytes_protected_other_pct']:.1f}%)" + ) + print( + " observed-source retained-prune totals: " + f"seen={summary['retained_prune_observed_source_segments_total']} ({human_bytes(summary['retained_prune_observed_source_bytes_total'])}) " + f"candidate={summary['retained_prune_observed_source_candidate_segments_total']} ({human_bytes(summary['retained_prune_observed_source_candidate_bytes_total'])}) " + f"removed={summary['retained_prune_observed_source_removed_segments_total']} ({human_bytes(summary['retained_prune_observed_source_removed_bytes_total'])}) " + f"zombie_marked={summary['retained_prune_observed_source_zombie_marked_segments_total']} ({human_bytes(summary['retained_prune_observed_source_zombie_marked_bytes_total'])}) " + f"live_skipped={summary['retained_prune_observed_source_live_skipped_segments_total']} ({human_bytes(summary['retained_prune_observed_source_live_skipped_bytes_total'])}) " + f"in_use_skipped={summary['retained_prune_observed_source_in_use_skipped_segments_total']} ({human_bytes(summary['retained_prune_observed_source_in_use_skipped_bytes_total'])}) " + f"(removed_of_candidate={summary['retained_prune_observed_removed_candidate_segments_pct']:.1f}% seg / " + f"{summary['retained_prune_observed_removed_candidate_bytes_pct']:.1f}% bytes, " + f"live_skip_of_candidate={summary['retained_prune_observed_live_skipped_candidate_segments_pct']:.1f}% seg / " + 
f"{summary['retained_prune_observed_live_skipped_candidate_bytes_pct']:.1f}% bytes)" + ) + + print("") + notes: list[str] = [] + if summary["rewrite_processed_stale_bytes"] > 0 and summary["rewrite_reclaimed_bytes"] == 0: + notes.append("rewrite copied stale bytes but immediate reclaim is zero; inspect GC eligibility/protection and post-run rewrite window") + if summary["observed_gc_pending_ids"] > 0: + notes.append("observed-source GC backlog still pending; may need longer run window or higher checkpoint-kick pressure") + if summary["observed_gc_retry_dropped"] > 0: + notes.append("observed-source GC retries hit max-attempt budget for some IDs; inspect retained-prune throughput and checkpoint-kick cadence") + if summary["maintenance_collision_rate_pct"] > 20.0: + notes.append("maintenance collision rate is high; lane contention may be throttling rewrite/GC progress") + if summary["rewrite_segment_realization_pct"] < 60.0 and summary["rewrite_plan_selected_segments_total"] > 0: + notes.append("rewrite segment realization is low; staged debt is being selected faster than executed") + if ( + summary["rewrite_exec_source_segments_unreferenced_total"] > 0 + and summary["retained_prune_observed_source_zombie_marked_segments_total"] > 0 + and summary["observed_gc_source_segments_deleted_total"] == 0 + and summary["vlog_zombie_segments"] == 0 + ): + notes.append("rewrite-selected sources became unreferenced and were zombie-marked, but GC delete counters stayed zero; reclaim likely happened via zombie lifecycle outside GC byte accounting") + if not notes: + notes.append("no obvious maintenance-lane bottleneck signature in this snapshot") + + print("Signals") + for note in notes: + print(f" - {note}") + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description="Analyze TreeDB live vlog maintenance capacity from run_celestia diagnostics") + p.add_argument( + "input", + nargs="?", + help="run home dir, diagnostics dir, or debug vars JSON file 
(default: latest ~/.celestia-app-mainnet-treedb-*)", + ) + p.add_argument( + "--instance-pattern", + default="application.db", + help="prefer instance names containing this substring when debug_vars has multiple DB instances", + ) + p.add_argument("--json", action="store_true", help="emit JSON summary instead of text report") + return p.parse_args() + + +def resolve_source(input_arg: str | None) -> Path: + if input_arg: + p = Path(os.path.expanduser(input_arg)).resolve() + if not p.exists(): + raise FileNotFoundError(f"input does not exist: {p}") + if p.is_file(): + return p + src = find_diagnostics_file(p) + if src is None: + raise FileNotFoundError(f"no diagnostics JSON found under: {p}") + return src + + home = find_latest_home() + if home is None: + raise FileNotFoundError("no ~/.celestia-app-mainnet-treedb-* directories found") + src = find_diagnostics_file(home) + if src is None: + raise FileNotFoundError(f"no diagnostics JSON found under: {home}") + return src + + +def main() -> int: + args = parse_args() + try: + source = resolve_source(args.input) + except FileNotFoundError as exc: + print(f"error: {exc}", file=sys.stderr) + return 2 + + try: + payload = json.loads(source.read_text(encoding="utf-8")) + except Exception as exc: + print(f"error: failed to parse JSON from {source}: {exc}", file=sys.stderr) + return 2 + + stats, instance_name = extract_stats(payload, args.instance_pattern) + if not stats: + print( + "error: could not extract treedb stats map from JSON (expected debug_vars shape or flat stats map)", + file=sys.stderr, + ) + return 2 + + summary = build_summary(stats) + run_home = find_home_from_path(source) + + if args.json: + out = { + "source_file": str(source), + "run_home": run_home, + "instance": instance_name, + "summary": summary, + } + print(json.dumps(out, indent=2, sort_keys=True)) + else: + print_report(summary, source, run_home, instance_name) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git 
a/scripts/celestia_fast_gate.sh b/scripts/celestia_fast_gate.sh new file mode 100755 index 000000000..59d93e551 --- /dev/null +++ b/scripts/celestia_fast_gate.sh @@ -0,0 +1,785 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) +cd "$ROOT" + +BASELINE_HASH="${BASELINE_HASH:-origin/main}" +CANDIDATE_UNIFIED_BIN="${CANDIDATE_UNIFIED_BIN:-}" +CANDIDATE_TREEMAP_BIN="${CANDIDATE_TREEMAP_BIN:-}" +BASELINE_UNIFIED_BIN="${BASELINE_UNIFIED_BIN:-}" +BASELINE_TREEMAP_BIN="${BASELINE_TREEMAP_BIN:-}" +SCRIPT_GOWORK="${SCRIPT_GOWORK:-off}" + +MAX_PAIRS="${MAX_PAIRS:-6}" +MIN_PAIRS="${MIN_PAIRS:-3}" +CLEAR_WIN_PAIRS="${CLEAR_WIN_PAIRS:-2}" +CLEAR_LOSS_PAIRS="${CLEAR_LOSS_PAIRS:-2}" +STOP_ON_CLEAR="${STOP_ON_CLEAR:-1}" +LOW_SIGNAL_MIN_PAIRS="${LOW_SIGNAL_MIN_PAIRS:-3}" +LOW_SIGNAL_NEUTRAL_STREAK="${LOW_SIGNAL_NEUTRAL_STREAK:-3}" +SLEEP_BETWEEN_RUNS_SECONDS="${SLEEP_BETWEEN_RUNS_SECONDS:-2}" + +SIZE_FIELD="${SIZE_FIELD:-s_post_app_bytes}" +SIZE_TOLERANCE_BYTES="${SIZE_TOLERANCE_BYTES:-67108864}" +TIME_TOLERANCE_SECONDS="${TIME_TOLERANCE_SECONDS:-30}" + +PROFILE="${PROFILE:-fast}" +DBS="${DBS:-treedb}" +TESTS="${TESTS:-batch_write}" +KEYS="${KEYS:-500000}" +VALSIZE="${VALSIZE:-128}" +BATCHSIZE="${BATCHSIZE:-8000}" +VAL_PATTERN="${VAL_PATTERN:-celestia_height_prefix_fill}" +SEED="${SEED:-1}" + +FORCE_VALUE_POINTERS="${FORCE_VALUE_POINTERS:-true}" +OUTER_LEAVES_IN_VLOG="${OUTER_LEAVES_IN_VLOG:-true}" +VLOG_COMPRESSION="${VLOG_COMPRESSION:-dict}" +VLOG_COMPRESSION_AUTOTUNE="${VLOG_COMPRESSION_AUTOTUNE:-aggressive}" +VLOG_COMPRESSION_VARIANT="${VLOG_COMPRESSION_VARIANT:-dict}" +DICT_TRAIN_BYTES="${DICT_TRAIN_BYTES:-1048576}" +DICT_BYTES="${DICT_BYTES:-32768}" +VLOG_REWRITE_MIN_SEGMENT_AGE_MS="${VLOG_REWRITE_MIN_SEGMENT_AGE_MS:-}" + +REWRITE_ENABLED="${REWRITE_ENABLED:-1}" +REWRITE_ARGS="${REWRITE_ARGS:--rw}" +MEASURE_GZIP="${MEASURE_GZIP:-1}" +KEEP_DB_DIRS="${KEEP_DB_DIRS:-1}" + +COMMON_EXTRA_FLAGS="${COMMON_EXTRA_FLAGS:-}" 
+CONTROL_EXTRA_FLAGS="${CONTROL_EXTRA_FLAGS:-}" +CANDIDATE_EXTRA_FLAGS="${CANDIDATE_EXTRA_FLAGS:-}" + +TS="$(date +%Y%m%d%H%M%S)" +OUT="${OUT_DIR:-$ROOT/artifacts/celestia_fast_gate/$TS}" + +WORKTREE_PATH="" + +require_cmd() { + if ! command -v "$1" >/dev/null 2>&1; then + echo "missing required command: $1" >&2 + exit 2 + fi +} + +du_bytes() { + local target="$1" + if [[ ! -e "$target" ]]; then + echo 0 + return 0 + fi + if du -sb "$target" >/dev/null 2>&1; then + du -sb "$target" 2>/dev/null | awk '{print $1}' + return 0 + fi + du -sk "$target" 2>/dev/null | awk '{print $1 * 1024}' +} + +gzip_dir_bytes() { + local target="$1" + if [[ "$MEASURE_GZIP" != "1" ]]; then + echo 0 + return 0 + fi + if [[ ! -d "$target" ]]; then + echo 0 + return 0 + fi + tar -C "$target" -cf - . 2>/dev/null | gzip -1 -c | wc -c | tr -d '[:space:]' +} + +cleanup() { + if [[ -n "$WORKTREE_PATH" && -d "$WORKTREE_PATH" ]]; then + git worktree remove --force "$WORKTREE_PATH" >/dev/null 2>&1 || true + fi +} +trap cleanup EXIT + +parse_bench_log() { + local log_path="$1" + python3 - "$log_path" <<'PY' +import re +import sys +from pathlib import Path + +path = Path(sys.argv[1]) +text = path.read_text(encoding="utf-8", errors="replace") +lines = text.splitlines() + +throughput = None +for line in lines: + m = re.search(r"Batch Write\s*/\s*TreeDB[^=]*=\s*([0-9][0-9,]*(?:\.[0-9]+)?)", line) + if m: + throughput = float(m.group(1).replace(",", "")) + break +if throughput is None: + for line in lines: + m = re.match(r"\s*Batch Write\s+([0-9][0-9,]*(?:\.[0-9]+)?)\s*$", line) + if m: + throughput = float(m.group(1).replace(",", "")) + break + +keep_dir = "" +in_keep_block = False +for line in lines: + stripped = line.strip() + if stripped == "Kept Data Directories": + in_keep_block = True + continue + if in_keep_block: + if not stripped: + continue + if stripped.startswith("TreeDB (") and ":" in stripped: + maybe = stripped.split(":", 1)[1].strip() + if maybe.startswith("/"): + keep_dir = maybe + 
break + +if not keep_dir: + m = re.search(r"TreeDB \([^\)]*\):\s+(/tmp/bench[^\s]+)", text) + if m: + keep_dir = m.group(1) + +if not keep_dir: + raise SystemExit("unable to locate kept data directory in unified-bench output") + +if throughput is None: + throughput = 0.0 + +print(f"{keep_dir}\t{throughput}") +PY +} + +setup_bins() { + mkdir -p "$OUT/bin" "$OUT/worktrees" "$OUT/runs" + + if [[ -z "$CANDIDATE_UNIFIED_BIN" ]]; then + CANDIDATE_UNIFIED_BIN="$OUT/bin/unified-bench-candidate" + GOWORK="$SCRIPT_GOWORK" go build -o "$CANDIDATE_UNIFIED_BIN" ./cmd/unified_bench + fi + if [[ ! -x "$CANDIDATE_UNIFIED_BIN" ]]; then + echo "candidate unified-bench binary not executable: $CANDIDATE_UNIFIED_BIN" >&2 + exit 2 + fi + + if [[ -z "$CANDIDATE_TREEMAP_BIN" ]]; then + CANDIDATE_TREEMAP_BIN="$OUT/bin/treemap-candidate" + GOWORK="$SCRIPT_GOWORK" go build -o "$CANDIDATE_TREEMAP_BIN" ./TreeDB/cmd/treemap + fi + if [[ ! -x "$CANDIDATE_TREEMAP_BIN" ]]; then + echo "candidate treemap binary not executable: $CANDIDATE_TREEMAP_BIN" >&2 + exit 2 + fi + + if [[ -n "$BASELINE_UNIFIED_BIN" && -n "$BASELINE_TREEMAP_BIN" ]]; then + if [[ ! -x "$BASELINE_UNIFIED_BIN" ]]; then + echo "baseline unified-bench binary not executable: $BASELINE_UNIFIED_BIN" >&2 + exit 2 + fi + if [[ ! -x "$BASELINE_TREEMAP_BIN" ]]; then + echo "baseline treemap binary not executable: $BASELINE_TREEMAP_BIN" >&2 + exit 2 + fi + return 0 + fi + + if ! 
git cat-file -e "${BASELINE_HASH}^{commit}" >/dev/null 2>&1; then + git fetch --no-tags --depth=1 origin "$BASELINE_HASH" >/dev/null 2>&1 || git fetch --no-tags origin "$BASELINE_HASH" >/dev/null 2>&1 + fi + + WORKTREE_PATH="$OUT/worktrees/baseline" + git worktree add --detach "$WORKTREE_PATH" "$BASELINE_HASH" >/dev/null + + if [[ -z "$BASELINE_UNIFIED_BIN" ]]; then + BASELINE_UNIFIED_BIN="$OUT/bin/unified-bench-baseline" + ( + cd "$WORKTREE_PATH" + GOWORK="$SCRIPT_GOWORK" go build -o "$BASELINE_UNIFIED_BIN" ./cmd/unified_bench + ) + fi + if [[ -z "$BASELINE_TREEMAP_BIN" ]]; then + BASELINE_TREEMAP_BIN="$OUT/bin/treemap-baseline" + ( + cd "$WORKTREE_PATH" + GOWORK="$SCRIPT_GOWORK" go build -o "$BASELINE_TREEMAP_BIN" ./TreeDB/cmd/treemap + ) + fi + + if [[ ! -x "$BASELINE_UNIFIED_BIN" ]]; then + echo "baseline unified-bench binary not executable: $BASELINE_UNIFIED_BIN" >&2 + exit 2 + fi + if [[ ! -x "$BASELINE_TREEMAP_BIN" ]]; then + echo "baseline treemap binary not executable: $BASELINE_TREEMAP_BIN" >&2 + exit 2 + fi +} + +run_variant() { + local pair_index="$1" + local variant="$2" + + local bench_bin treemap_bin extra_flags + if [[ "$variant" == "candidate" ]]; then + bench_bin="$CANDIDATE_UNIFIED_BIN" + treemap_bin="$CANDIDATE_TREEMAP_BIN" + extra_flags="$CANDIDATE_EXTRA_FLAGS" + else + bench_bin="$BASELINE_UNIFIED_BIN" + treemap_bin="$BASELINE_TREEMAP_BIN" + extra_flags="$CONTROL_EXTRA_FLAGS" + fi + + local run_id + run_id=$(printf "%02d_%s" "$pair_index" "$variant") + local run_dir="$OUT/runs/$run_id" + mkdir -p "$run_dir" + + local cmd=( + "$bench_bin" + -profile "$PROFILE" + -dbs "$DBS" + -keys "$KEYS" + -valsize "$VALSIZE" + -batchsize "$BATCHSIZE" + -test "$TESTS" + -val-pattern "$VAL_PATTERN" + -seed "$SEED" + -progress=false + -keep + -treedb-force-value-pointers="$FORCE_VALUE_POINTERS" + -treedb-index-outer-leaves-in-vlog="$OUTER_LEAVES_IN_VLOG" + -treedb-vlog-compression "$VLOG_COMPRESSION" + -treedb-vlog-compression-autotune 
"$VLOG_COMPRESSION_AUTOTUNE" + -treedb-vlog-compression-variant "$VLOG_COMPRESSION_VARIANT" + -treedb-vlog-dict-train-bytes "$DICT_TRAIN_BYTES" + -treedb-vlog-dict-dict-bytes "$DICT_BYTES" + ) + if [[ -n "$VLOG_REWRITE_MIN_SEGMENT_AGE_MS" ]]; then + cmd+=(-treedb-vlog-rewrite-min-segment-age-ms "$VLOG_REWRITE_MIN_SEGMENT_AGE_MS") + fi + + if [[ -n "$COMMON_EXTRA_FLAGS" ]]; then + # shellcheck disable=SC2206 + local common_extra=( $COMMON_EXTRA_FLAGS ) + cmd+=("${common_extra[@]}") + fi + if [[ -n "$extra_flags" ]]; then + # shellcheck disable=SC2206 + local variant_extra=( $extra_flags ) + cmd+=("${variant_extra[@]}") + fi + + printf '%q ' "${cmd[@]}" >"$run_dir/cmd.txt" + echo >>"$run_dir/cmd.txt" + + local bench_log="$run_dir/unified.log" + local run_start run_end + run_start=$(date +%s) + "${cmd[@]}" >"$bench_log" 2>&1 + run_end=$(date +%s) + + local parse_out keep_dir batch_write_ops + parse_out="$(parse_bench_log "$bench_log")" + keep_dir="${parse_out%%$'\t'*}" + batch_write_ops="${parse_out#*$'\t'}" + + if [[ -z "$keep_dir" || ! -d "$keep_dir" ]]; then + echo "missing kept dir for $run_id (parsed=$keep_dir)" >&2 + exit 1 + fi + + local sync_app_bytes sync_wal_bytes sync_gzip_bytes + sync_app_bytes="$(du_bytes "$keep_dir")" + sync_wal_bytes="$(du_bytes "$keep_dir/maindb/wal")" + sync_gzip_bytes="$(gzip_dir_bytes "$keep_dir")" + + local rewrite_attempted=0 + local rewrite_seconds=0 + local rewrite_rc=0 + local rewrite_log="$run_dir/rewrite.log" + if [[ "$REWRITE_ENABLED" == "1" ]]; then + rewrite_attempted=1 + local rw_start rw_end + rw_start=$(date +%s) + # shellcheck disable=SC2206 + local rw_args=( $REWRITE_ARGS ) + set +e + "$treemap_bin" vlog-rewrite "$keep_dir" "${rw_args[@]}" >"$rewrite_log" 2>&1 + rewrite_rc=$? 
+ set -e + rw_end=$(date +%s) + rewrite_seconds=$((rw_end - rw_start)) + fi + + local post_app_bytes post_wal_bytes post_gzip_bytes + post_app_bytes="$(du_bytes "$keep_dir")" + post_wal_bytes="$(du_bytes "$keep_dir/maindb/wal")" + post_gzip_bytes="$(gzip_dir_bytes "$keep_dir")" + + local run_json="$run_dir/run.json" + python3 - "$run_json" "$pair_index" "$variant" "$run_start" "$run_end" "$rewrite_attempted" "$rewrite_seconds" "$rewrite_rc" "$batch_write_ops" "$keep_dir" "$sync_app_bytes" "$sync_wal_bytes" "$sync_gzip_bytes" "$post_app_bytes" "$post_wal_bytes" "$post_gzip_bytes" <<'PY' +import json +import sys +from pathlib import Path + +out_path = Path(sys.argv[1]) +pair_index = int(sys.argv[2]) +variant = sys.argv[3] +run_start = int(sys.argv[4]) +run_end = int(sys.argv[5]) +rewrite_attempted = int(sys.argv[6]) +rewrite_seconds = int(sys.argv[7]) +rewrite_rc = int(sys.argv[8]) +batch_write_ops = float(sys.argv[9]) +keep_dir = sys.argv[10] +s_sync_app = int(sys.argv[11]) +s_sync_wal = int(sys.argv[12]) +s_sync_gzip = int(sys.argv[13]) +s_post_app = int(sys.argv[14]) +s_post_wal = int(sys.argv[15]) +s_post_gzip = int(sys.argv[16]) + +t_sync = max(0, run_end - run_start) +t_rewrite = rewrite_seconds if rewrite_attempted == 1 else 0 +if rewrite_attempted == 1 and rewrite_rc != 0: + t_total = None +else: + t_total = t_sync + t_rewrite + +payload = { + "pair_index": pair_index, + "variant": variant, + "keep_dir": keep_dir, + "bench": { + "duration_seconds": t_sync, + "batch_write_ops_per_sec": batch_write_ops, + }, + "rewrite": { + "attempted": rewrite_attempted == 1, + "seconds": t_rewrite, + "exit_code": rewrite_rc, + }, + "sizes": { + "sync_app_bytes": s_sync_app, + "sync_wal_bytes": s_sync_wal, + "sync_gzip_bytes": s_sync_gzip, + "post_app_bytes": s_post_app, + "post_wal_bytes": s_post_wal, + "post_gzip_bytes": s_post_gzip, + }, + "metrics": { + "t_sync_seconds": t_sync, + "t_rewrite_seconds": t_rewrite, + "t_total_seconds": t_total, + "batch_write_ops_per_sec": 
batch_write_ops, + "s_sync_app_bytes": s_sync_app, + "s_sync_wal_bytes": s_sync_wal, + "s_sync_gzip_bytes": s_sync_gzip, + "s_post_app_bytes": s_post_app, + "s_post_wal_bytes": s_post_wal, + "s_post_gzip_bytes": s_post_gzip, + }, +} +out_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") +print(out_path) +PY + + if [[ "$KEEP_DB_DIRS" != "1" ]]; then + rm -rf "$keep_dir" + fi + + echo "run_id=$run_id keep_dir=$keep_dir json=$run_json" +} + +aggregate_and_decide() { + local decision_json="$OUT/decision.json" + python3 - "$OUT" "$SIZE_FIELD" "$SIZE_TOLERANCE_BYTES" "$TIME_TOLERANCE_SECONDS" "$MIN_PAIRS" "$CLEAR_WIN_PAIRS" "$CLEAR_LOSS_PAIRS" "$MAX_PAIRS" "$STOP_ON_CLEAR" "$LOW_SIGNAL_MIN_PAIRS" "$LOW_SIGNAL_NEUTRAL_STREAK" "$decision_json" <<'PY' +import csv +import json +import statistics +import sys +from pathlib import Path + +out = Path(sys.argv[1]) +size_field = sys.argv[2] +size_tol = int(sys.argv[3]) +time_tol = int(sys.argv[4]) +min_pairs = int(sys.argv[5]) +clear_win_pairs = int(sys.argv[6]) +clear_loss_pairs = int(sys.argv[7]) +max_pairs = int(sys.argv[8]) +stop_on_clear = sys.argv[9] == "1" +low_signal_min_pairs = int(sys.argv[10]) +low_signal_neutral_streak = int(sys.argv[11]) +decision_path = Path(sys.argv[12]) + +run_files = sorted(out.glob("runs/*/run.json")) +runs = [] +for p in run_files: + try: + runs.append(json.loads(p.read_text(encoding="utf-8"))) + except Exception: + continue +runs.sort(key=lambda r: (int(r.get("pair_index", 0)), str(r.get("variant", "")))) + +runs_csv = out / "runs.csv" +with runs_csv.open("w", newline="", encoding="utf-8") as fh: + w = csv.writer(fh) + w.writerow([ + "pair_index", + "variant", + "keep_dir", + "t_sync_seconds", + "t_rewrite_seconds", + "t_total_seconds", + "batch_write_ops_per_sec", + "s_sync_app_bytes", + "s_sync_wal_bytes", + "s_sync_gzip_bytes", + "s_post_app_bytes", + "s_post_wal_bytes", + "s_post_gzip_bytes", + "rewrite_exit_code", + ]) + for r in runs: + m = r.get("metrics", 
{}) or {} + rw = r.get("rewrite", {}) or {} + w.writerow([ + int(r.get("pair_index", 0)), + str(r.get("variant", "")), + str(r.get("keep_dir", "")), + m.get("t_sync_seconds"), + m.get("t_rewrite_seconds"), + m.get("t_total_seconds"), + m.get("batch_write_ops_per_sec"), + m.get("s_sync_app_bytes"), + m.get("s_sync_wal_bytes"), + m.get("s_sync_gzip_bytes"), + m.get("s_post_app_bytes"), + m.get("s_post_wal_bytes"), + m.get("s_post_gzip_bytes"), + rw.get("exit_code"), + ]) + +by_pair = {} +for r in runs: + pair = int(r.get("pair_index", 0)) + by_pair.setdefault(pair, {})[str(r.get("variant", ""))] = r + +def delta(a, b): + if a is None or b is None: + return None + try: + return a - b + except Exception: + return None + +pair_rows = [] +wins = 0 +losses = 0 +for pair in sorted(by_pair): + row = by_pair[pair] + ctrl = row.get("control") + cand = row.get("candidate") + if not ctrl or not cand: + continue + + cm = cand.get("metrics", {}) or {} + bm = ctrl.get("metrics", {}) or {} + + d_sync = delta(cm.get("t_sync_seconds"), bm.get("t_sync_seconds")) + d_total = delta(cm.get("t_total_seconds"), bm.get("t_total_seconds")) + d_bw = delta(cm.get("batch_write_ops_per_sec"), bm.get("batch_write_ops_per_sec")) + + d_sync_app = delta(cm.get("s_sync_app_bytes"), bm.get("s_sync_app_bytes")) + d_sync_wal = delta(cm.get("s_sync_wal_bytes"), bm.get("s_sync_wal_bytes")) + d_sync_gzip = delta(cm.get("s_sync_gzip_bytes"), bm.get("s_sync_gzip_bytes")) + d_post_app = delta(cm.get("s_post_app_bytes"), bm.get("s_post_app_bytes")) + d_post_wal = delta(cm.get("s_post_wal_bytes"), bm.get("s_post_wal_bytes")) + d_post_gzip = delta(cm.get("s_post_gzip_bytes"), bm.get("s_post_gzip_bytes")) + + d_size_primary = delta(cm.get(size_field), bm.get(size_field)) + + outcome = "neutral" + if d_size_primary is not None and d_total is not None: + win = (d_size_primary <= -size_tol) and (d_total <= time_tol) + loss = (d_size_primary >= size_tol) and (d_total >= -time_tol) + if win and not loss: + outcome = 
"win" + wins += 1 + elif loss and not win: + outcome = "loss" + losses += 1 + + pair_rows.append( + { + "pair_index": pair, + "delta_t_sync_seconds": d_sync, + "delta_t_total_seconds": d_total, + "delta_batch_write_ops_per_sec": d_bw, + "delta_s_sync_app_bytes": d_sync_app, + "delta_s_sync_wal_bytes": d_sync_wal, + "delta_s_sync_gzip_bytes": d_sync_gzip, + "delta_s_post_app_bytes": d_post_app, + "delta_s_post_wal_bytes": d_post_wal, + "delta_s_post_gzip_bytes": d_post_gzip, + "delta_size_primary_bytes": d_size_primary, + "outcome": outcome, + } + ) + +pairs_csv = out / "pairs.csv" +with pairs_csv.open("w", newline="", encoding="utf-8") as fh: + w = csv.writer(fh) + w.writerow( + [ + "pair_index", + "delta_t_sync_seconds", + "delta_t_total_seconds", + "delta_batch_write_ops_per_sec", + "delta_s_sync_app_bytes", + "delta_s_sync_wal_bytes", + "delta_s_sync_gzip_bytes", + "delta_s_post_app_bytes", + "delta_s_post_wal_bytes", + "delta_s_post_gzip_bytes", + "delta_size_primary_bytes", + "outcome", + ] + ) + for r in pair_rows: + w.writerow( + [ + r["pair_index"], + r["delta_t_sync_seconds"], + r["delta_t_total_seconds"], + r["delta_batch_write_ops_per_sec"], + r["delta_s_sync_app_bytes"], + r["delta_s_sync_wal_bytes"], + r["delta_s_sync_gzip_bytes"], + r["delta_s_post_app_bytes"], + r["delta_s_post_wal_bytes"], + r["delta_s_post_gzip_bytes"], + r["delta_size_primary_bytes"], + r["outcome"], + ] + ) + +completed_pairs = len(pair_rows) +neutral = max(0, completed_pairs - wins - losses) +neutral_streak = 0 +for row in reversed(pair_rows): + if row.get("outcome") == "neutral": + neutral_streak += 1 + continue + break + +reason = "continue" +stop = False +if stop_on_clear and completed_pairs >= min_pairs: + if wins >= clear_win_pairs and wins > losses: + stop = True + reason = "clear_improvement" + elif losses >= clear_loss_pairs and losses > wins: + stop = True + reason = "clear_regression" + else: + remaining = max(0, max_pairs - completed_pairs) + can_reach_clear_win = 
(wins + remaining) >= clear_win_pairs + can_reach_clear_loss = (losses + remaining) >= clear_loss_pairs + if not can_reach_clear_win and not can_reach_clear_loss: + stop = True + reason = "futile_remaining_pairs" + +if (not stop) and completed_pairs >= low_signal_min_pairs and neutral_streak >= low_signal_neutral_streak: + stop = True + reason = "low_signal_neutral_streak" + +if (not stop) and completed_pairs >= max_pairs: + stop = True + reason = "max_pairs" + +med_delta_size = None +med_delta_total = None +size_values = [r["delta_size_primary_bytes"] for r in pair_rows if r.get("delta_size_primary_bytes") is not None] +time_values = [r["delta_t_total_seconds"] for r in pair_rows if r.get("delta_t_total_seconds") is not None] +if size_values: + med_delta_size = statistics.median(size_values) +if time_values: + med_delta_total = statistics.median(time_values) + +summary_md = out / "summary.md" +lines = [] +lines.append("# celestia_fast_gate summary") +lines.append("") +lines.append(f"- completed pairs: `{completed_pairs}`") +lines.append(f"- wins/losses/neutral: `{wins}` / `{losses}` / `{neutral}`") +lines.append(f"- neutral streak (tail): `{neutral_streak}`") +lines.append(f"- size field: `{size_field}`") +lines.append(f"- size tolerance bytes: `{size_tol}`") +lines.append(f"- time tolerance seconds: `{time_tol}`") +lines.append(f"- low-signal min pairs: `{low_signal_min_pairs}`") +lines.append(f"- low-signal neutral streak: `{low_signal_neutral_streak}`") +lines.append(f"- median delta(size): `{med_delta_size}`") +lines.append(f"- median delta(time_total): `{med_delta_total}`") +lines.append(f"- decision: `{reason}`") +lines.append("") +lines.append("## Artifacts") +lines.append("") +lines.append(f"- runs csv: `{runs_csv}`") +lines.append(f"- pairs csv: `{pairs_csv}`") +lines.append(f"- per-run json: `{out / 'runs'}`") +summary_md.write_text("\n".join(lines) + "\n", encoding="utf-8") + +review_md = out / "process_review.md" +review = [] +review.append("# Fast 
Loop Review") +review.append("") +review.append("## Signal Check") +review.append("") +review.append(f"- completed_pairs={completed_pairs}") +review.append(f"- neutral_streak={neutral_streak}") +review.append(f"- reason={reason}") +if med_delta_size is not None: + review.append(f"- median_delta_size_bytes={int(med_delta_size)}") +if med_delta_total is not None: + review.append(f"- median_delta_time_seconds={int(med_delta_total)}") +review.append("") +review.append("## Suggested Next Action") +review.append("") +if reason in {"low_signal_neutral_streak", "futile_remaining_pairs"}: + review.append("- Stop long validation; this loop is currently low-signal for the configured tolerance.") + review.append("- Increase expected effect size (bundle larger code changes) or increase micro workload stress before re-running.") +elif reason == "clear_regression": + review.append("- Reject candidate as-is; run pprof on this fast gate to isolate removable overhead before retrying.") +elif reason == "clear_improvement": + review.append("- Promote candidate to run_celestia A/B confirmation.") +else: + review.append("- Continue collecting interleaved pairs until a clear outcome or low-signal stop triggers.") +review_md.write_text("\n".join(review) + "\n", encoding="utf-8") + +payload = { + "completed_pairs": completed_pairs, + "wins": wins, + "losses": losses, + "neutral": neutral, + "neutral_streak": neutral_streak, + "size_field": size_field, + "size_tolerance_bytes": size_tol, + "time_tolerance_seconds": time_tol, + "median_delta_size_bytes": med_delta_size, + "median_delta_time_seconds": med_delta_total, + "stop": stop, + "reason": reason, +} +decision_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") +print(json.dumps(payload, sort_keys=True)) +PY +} + +run_pair() { + local pair_index="$1" + if (( pair_index % 2 == 1 )); then + run_variant "$pair_index" "control" + sleep "$SLEEP_BETWEEN_RUNS_SECONDS" + run_variant "$pair_index" "candidate" + else 
+ run_variant "$pair_index" "candidate" + sleep "$SLEEP_BETWEEN_RUNS_SECONDS" + run_variant "$pair_index" "control" + fi +} + +require_cmd git +require_cmd go +require_cmd python3 +require_cmd tar +require_cmd gzip +require_cmd wc + +if (( MAX_PAIRS < 1 )); then + echo "MAX_PAIRS must be >= 1" >&2 + exit 2 +fi + +mkdir -p "$OUT" +setup_bins + +cat >"$OUT/meta.txt" </dev/null 2>&1; then + echo "python3 is required" >&2 + exit 1 +fi +if [[ ! -x "$ANALYZER" ]]; then + echo "analyzer not found/executable: $ANALYZER" >&2 + exit 1 +fi +if [[ "$MAX_PAIRS" -lt 1 ]]; then + echo "MAX_PAIRS must be >= 1" >&2 + exit 1 +fi + +mkdir -p "$OUT/runs" + +cat >"$OUT/meta.txt" </dev/null || true +} + +du_bytes() { + local target="$1" + if [[ ! -e "$target" ]]; then + echo 0 + return 0 + fi + if du -sb "$target" >/dev/null 2>&1; then + du -sb "$target" 2>/dev/null | awk '{print $1}' + return 0 + fi + du -sk "$target" 2>/dev/null | awk '{print $1 * 1024}' +} + +detect_new_run_home() { + local before_file="$1" + local -A seen=() + while IFS= read -r path; do + [[ -n "$path" ]] && seen["$path"]=1 + done <"$before_file" + + while IFS= read -r path; do + if [[ -z "$path" ]]; then + continue + fi + if [[ -z "${seen[$path]+x}" ]]; then + echo "$path" + return 0 + fi + done < <(list_run_homes) + + list_run_homes | head -n 1 +} + +run_variant() { + local pair_index="$1" + local variant="$2" + local env_file="$3" + + local run_id + run_id=$(printf "%02d_%s" "$pair_index" "$variant") + local run_dir="$OUT/runs/$run_id" + mkdir -p "$run_dir" + + local before_file="$run_dir/before_homes.txt" + list_run_homes >"$before_file" + + local run_start + run_start=$(date +%s) + ( + set -euo pipefail + if [[ -n "$env_file" ]]; then + # shellcheck source=/dev/null + set -a + source "$env_file" + set +a + fi + # Non-login shell avoids user profile side effects (e.g. tty-dependent exports) + # that can fail under nohup/background runs. 
+ bash -c "$RUN_CMD" + ) >"$run_dir/launcher.log" 2>&1 + local run_end + run_end=$(date +%s) + + local run_home + run_home="$(detect_new_run_home "$before_file")" + if [[ -z "$run_home" || ! -d "$run_home" ]]; then + echo "failed to detect run home for $run_id" >&2 + exit 1 + fi + + local app_db="$run_home/data/application.db" + local pre_app_bytes pre_wal_bytes + pre_app_bytes="$(du_bytes "$app_db")" + pre_wal_bytes="$(du_bytes "$app_db/maindb/wal")" + + local analyze_json="$run_dir/maintenance.json" + if ! "$ANALYZER" --json "$run_home" >"$analyze_json" 2>"$run_dir/analyze.stderr.log"; then + rm -f "$analyze_json" + fi + + local rewrite_attempted=0 + local rewrite_seconds=0 + local rewrite_rc=0 + if [[ "$REWRITE_ENABLED" == "1" && -x "$TREEMAP_BIN" && -d "$app_db" ]]; then + rewrite_attempted=1 + local rewrite_start + rewrite_start=$(date +%s) + set +e + "$TREEMAP_BIN" vlog-rewrite "$app_db" -rw >"$run_dir/rewrite.log" 2>&1 + rewrite_rc=$? + set -e + local rewrite_end + rewrite_end=$(date +%s) + rewrite_seconds=$((rewrite_end - rewrite_start)) + else + rewrite_rc=0 + fi + + local post_app_bytes post_wal_bytes + post_app_bytes="$(du_bytes "$app_db")" + post_wal_bytes="$(du_bytes "$app_db/maindb/wal")" + + local run_json="$run_dir/run.json" + python3 - "$run_home" "$run_json" "$variant" "$pair_index" "$run_start" "$run_end" "$rewrite_attempted" "$rewrite_seconds" "$rewrite_rc" "$pre_app_bytes" "$pre_wal_bytes" "$post_app_bytes" "$post_wal_bytes" "$analyze_json" <<'PY' +import json +import sys +from pathlib import Path + +run_home = Path(sys.argv[1]) +out_path = Path(sys.argv[2]) +variant = sys.argv[3] +pair_index = int(sys.argv[4]) +run_start = int(sys.argv[5]) +run_end = int(sys.argv[6]) +rewrite_attempted = int(sys.argv[7]) +rewrite_seconds = int(sys.argv[8]) +rewrite_rc = int(sys.argv[9]) +pre_app_bytes = int(sys.argv[10]) +pre_wal_bytes = int(sys.argv[11]) +post_app_bytes = int(sys.argv[12]) +post_wal_bytes = int(sys.argv[13]) +analyze_json_path = 
Path(sys.argv[14]) + +def parse_sync_time(path: Path) -> dict[str, str]: + out: dict[str, str] = {} + if not path.is_file(): + return out + for line in path.read_text(encoding="utf-8", errors="replace").splitlines(): + line = line.strip() + if not line or line == "---" or "=" not in line: + continue + k, v = line.split("=", 1) + out[k.strip()] = v.strip() + return out + +def safe_int(raw: str | None, default: int = 0) -> int: + if raw is None: + return default + s = str(raw).strip() + if not s: + return default + try: + return int(s) + except Exception: + try: + return int(float(s)) + except Exception: + return default + +sync = parse_sync_time(run_home / "sync" / "sync-time.log") +maintenance = {} +if analyze_json_path.is_file(): + try: + payload = json.loads(analyze_json_path.read_text(encoding="utf-8")) + if isinstance(payload, dict): + summary = payload.get("summary") + if isinstance(summary, dict): + maintenance = summary + except Exception: + maintenance = {} + +t_sync = safe_int(sync.get("duration_seconds"), max(0, run_end - run_start)) +t_rw = rewrite_seconds if rewrite_attempted == 1 else 0 +if rewrite_attempted == 1 and rewrite_rc != 0: + t_total = None +else: + t_total = t_sync + t_rw + +result = { + "pair_index": pair_index, + "variant": variant, + "run_home": str(run_home), + "sync": { + "duration_seconds": t_sync, + "max_rss_kb": safe_int(sync.get("max_rss_kb"), 0), + "max_hwm_kb": safe_int(sync.get("max_hwm_kb"), 0), + "end_app_bytes": safe_int(sync.get("end_app_bytes"), pre_app_bytes), + "end_data_bytes": safe_int(sync.get("end_data_bytes"), 0), + "end_home_bytes": safe_int(sync.get("end_home_bytes"), 0), + }, + "rewrite": { + "attempted": rewrite_attempted == 1, + "seconds": t_rw, + "exit_code": rewrite_rc, + }, + "sizes": { + "sync_app_bytes": pre_app_bytes, + "sync_wal_bytes": pre_wal_bytes, + "post_app_bytes": post_app_bytes, + "post_wal_bytes": post_wal_bytes, + }, + "metrics": { + "t_sync_seconds": t_sync, + "t_rewrite_seconds": t_rw, + 
"t_total_seconds": t_total, + "s_sync_app_bytes": pre_app_bytes, + "s_sync_wal_bytes": pre_wal_bytes, + "s_post_app_bytes": post_app_bytes, + "s_post_wal_bytes": post_wal_bytes, + "max_rss_kb": safe_int(sync.get("max_rss_kb"), 0), + }, + "maintenance_summary": maintenance, +} +out_path.write_text(json.dumps(result, indent=2, sort_keys=True), encoding="utf-8") +print(out_path) +PY + + echo "run_id=$run_id run_home=$run_home json=$run_json" +} + +aggregate_and_decide() { + local decision_json="$OUT/decision.json" + python3 - "$OUT" "$SIZE_TOLERANCE_BYTES" "$TIME_TOLERANCE_SECONDS" "$MIN_PAIRS" "$CLEAR_WIN_PAIRS" "$CLEAR_LOSS_PAIRS" "$MAX_PAIRS" "$STOP_ON_CLEAR" "$LOW_SIGNAL_MIN_PAIRS" "$LOW_SIGNAL_NEUTRAL_STREAK" "$decision_json" <<'PY' +import csv +import json +import sys +from pathlib import Path + +out = Path(sys.argv[1]) +size_tol = int(sys.argv[2]) +time_tol = int(sys.argv[3]) +min_pairs = int(sys.argv[4]) +clear_win_pairs = int(sys.argv[5]) +clear_loss_pairs = int(sys.argv[6]) +max_pairs = int(sys.argv[7]) +stop_on_clear = sys.argv[8] == "1" +low_signal_min_pairs = int(sys.argv[9]) +low_signal_neutral_streak = int(sys.argv[10]) +decision_path = Path(sys.argv[11]) + +run_files = sorted(out.glob("runs/*/run.json")) +runs = [] +for p in run_files: + try: + runs.append(json.loads(p.read_text(encoding="utf-8"))) + except Exception: + continue + +runs.sort(key=lambda r: (int(r.get("pair_index", 0)), str(r.get("variant", "")))) + +runs_csv = out / "runs.csv" +with runs_csv.open("w", newline="", encoding="utf-8") as fh: + w = csv.writer(fh) + w.writerow([ + "pair_index", + "variant", + "run_home", + "t_sync_seconds", + "t_rewrite_seconds", + "t_total_seconds", + "s_sync_app_bytes", + "s_sync_wal_bytes", + "s_post_app_bytes", + "s_post_wal_bytes", + "max_rss_kb", + "rewrite_exit_code", + "rewrite_runs", + "gc_runs", + "observed_gc_retry_queued", + "observed_gc_retry_dropped", + ]) + for r in runs: + m = r.get("metrics", {}) or {} + s = r.get("sizes", {}) or {} + rw = 
r.get("rewrite", {}) or {} + summary = r.get("maintenance_summary", {}) or {} + w.writerow([ + int(r.get("pair_index", 0)), + str(r.get("variant", "")), + str(r.get("run_home", "")), + m.get("t_sync_seconds"), + m.get("t_rewrite_seconds"), + m.get("t_total_seconds"), + s.get("sync_app_bytes"), + s.get("sync_wal_bytes"), + s.get("post_app_bytes"), + s.get("post_wal_bytes"), + m.get("max_rss_kb"), + rw.get("exit_code"), + summary.get("rewrite_runs", 0), + summary.get("gc_runs", 0), + summary.get("observed_gc_retry_queued", 0), + summary.get("observed_gc_retry_dropped", 0), + ]) + +by_pair: dict[int, dict[str, dict]] = {} +for r in runs: + pair = int(r.get("pair_index", 0)) + by_pair.setdefault(pair, {})[str(r.get("variant", ""))] = r + +pair_rows = [] +wins = 0 +losses = 0 +for pair in sorted(by_pair): + row = by_pair[pair] + ctrl = row.get("control") + cand = row.get("candidate") + if not ctrl or not cand: + continue + cm = cand.get("metrics", {}) or {} + bm = ctrl.get("metrics", {}) or {} + cand_total = cm.get("t_total_seconds") + base_total = bm.get("t_total_seconds") + cand_post_wal = cm.get("s_post_wal_bytes") + base_post_wal = bm.get("s_post_wal_bytes") + cand_sync = cm.get("t_sync_seconds") + base_sync = bm.get("t_sync_seconds") + cand_sync_app = cm.get("s_sync_app_bytes") + base_sync_app = bm.get("s_sync_app_bytes") + + def delta(a, b): + if a is None or b is None: + return None + return a - b + + d_total = delta(cand_total, base_total) + d_sync = delta(cand_sync, base_sync) + d_post_wal = delta(cand_post_wal, base_post_wal) + d_sync_app = delta(cand_sync_app, base_sync_app) + + outcome = "neutral" + if d_post_wal is not None and d_total is not None: + win = (d_post_wal <= -size_tol) and (d_total <= time_tol) + loss = (d_post_wal >= size_tol) and (d_total >= -time_tol) + if win and not loss: + outcome = "win" + wins += 1 + elif loss and not win: + outcome = "loss" + losses += 1 + + pair_rows.append({ + "pair_index": pair, + "delta_t_sync_seconds": d_sync, + 
"delta_t_total_seconds": d_total, + "delta_s_sync_app_bytes": d_sync_app, + "delta_s_post_wal_bytes": d_post_wal, + "outcome": outcome, + }) + +pairs_csv = out / "pairs.csv" +with pairs_csv.open("w", newline="", encoding="utf-8") as fh: + w = csv.writer(fh) + w.writerow([ + "pair_index", + "delta_t_sync_seconds", + "delta_t_total_seconds", + "delta_s_sync_app_bytes", + "delta_s_post_wal_bytes", + "outcome", + ]) + for r in pair_rows: + w.writerow([ + r["pair_index"], + r["delta_t_sync_seconds"], + r["delta_t_total_seconds"], + r["delta_s_sync_app_bytes"], + r["delta_s_post_wal_bytes"], + r["outcome"], + ]) + +completed_pairs = len(pair_rows) +neutral = max(0, completed_pairs - wins - losses) +neutral_streak = 0 +for row in reversed(pair_rows): + if row.get("outcome") == "neutral": + neutral_streak += 1 + continue + break + +reason = "continue" +stop = False +if stop_on_clear and completed_pairs >= min_pairs: + if wins >= clear_win_pairs and wins > losses: + stop = True + reason = "clear_improvement" + elif losses >= clear_loss_pairs and losses > wins: + stop = True + reason = "clear_regression" + else: + remaining = max(0, max_pairs - completed_pairs) + can_reach_clear_win = (wins + remaining) >= clear_win_pairs + can_reach_clear_loss = (losses + remaining) >= clear_loss_pairs + if not can_reach_clear_win and not can_reach_clear_loss: + stop = True + reason = "futile_remaining_pairs" + +if (not stop) and completed_pairs >= low_signal_min_pairs and neutral_streak >= low_signal_neutral_streak: + stop = True + reason = "low_signal_neutral_streak" + +if (not stop) and completed_pairs >= max_pairs: + stop = True + reason = "max_pairs" + +summary_md = out / "summary.md" +lines = [] +lines.append("# run_celestia A/B summary") +lines.append("") +lines.append(f"- completed pairs: `{completed_pairs}`") +lines.append(f"- wins/losses/neutral: `{wins}` / `{losses}` / `{neutral}`") +lines.append(f"- neutral streak (tail): `{neutral_streak}`") +lines.append(f"- size tolerance 
bytes: `{size_tol}`") +lines.append(f"- time tolerance seconds: `{time_tol}`") +lines.append(f"- low-signal min pairs: `{low_signal_min_pairs}`") +lines.append(f"- low-signal neutral streak: `{low_signal_neutral_streak}`") +lines.append(f"- decision: `{reason}`") +lines.append("") +lines.append("## Artifacts") +lines.append("") +lines.append(f"- runs csv: `{runs_csv}`") +lines.append(f"- pairs csv: `{pairs_csv}`") +lines.append(f"- per-run json: `{out / 'runs'}`") +if pair_rows: + last = pair_rows[-1] + lines.append("") + lines.append("## Last Pair") + lines.append("") + lines.append(f"- pair: `{last['pair_index']}` outcome=`{last['outcome']}`") + lines.append(f"- delta_t_sync_seconds: `{last['delta_t_sync_seconds']}`") + lines.append(f"- delta_t_total_seconds: `{last['delta_t_total_seconds']}`") + lines.append(f"- delta_s_sync_app_bytes: `{last['delta_s_sync_app_bytes']}`") + lines.append(f"- delta_s_post_wal_bytes: `{last['delta_s_post_wal_bytes']}`") +summary_md.write_text("\n".join(lines) + "\n", encoding="utf-8") + +payload = { + "completed_pairs": completed_pairs, + "wins": wins, + "losses": losses, + "neutral": neutral, + "neutral_streak": neutral_streak, + "stop": stop, + "reason": reason, +} +decision_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") +print(json.dumps(payload, sort_keys=True)) +PY +} + +run_pair() { + local pair_index="$1" + if (( pair_index % 2 == 1 )); then + run_variant "$pair_index" "control" "$CONTROL_ENV_FILE" + sleep "$SLEEP_BETWEEN_RUNS_SECONDS" + run_variant "$pair_index" "candidate" "$CANDIDATE_ENV_FILE" + else + run_variant "$pair_index" "candidate" "$CANDIDATE_ENV_FILE" + sleep "$SLEEP_BETWEEN_RUNS_SECONDS" + run_variant "$pair_index" "control" "$CONTROL_ENV_FILE" + fi +} + +echo "output=$OUT" +echo "run_cmd=$RUN_CMD" + +decision_reason="continue" +for ((pair = 1; pair <= MAX_PAIRS; pair++)); do + echo "pair=$pair start" + run_pair "$pair" + aggregate_and_decide + decision_reason="$(python3 - 
"$OUT/decision.json" <<'PY' +import json +import sys +payload = json.loads(open(sys.argv[1], 'r', encoding='utf-8').read()) +print(payload.get('reason', 'continue')) +print('1' if payload.get('stop') else '0') +PY +)" + stop_flag="$(echo "$decision_reason" | tail -n 1)" + decision_reason="$(echo "$decision_reason" | head -n 1)" + echo "pair=$pair decision=$decision_reason" + if [[ "$stop_flag" == "1" ]]; then + break + fi + sleep "$SLEEP_BETWEEN_RUNS_SECONDS" +done + +echo "completed decision=$decision_reason" +echo "summary=$OUT/summary.md" +echo "runs_csv=$OUT/runs.csv" +echo "pairs_csv=$OUT/pairs.csv" diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md new file mode 100644 index 000000000..c4178103a --- /dev/null +++ b/worklog/2026-03-27.md @@ -0,0 +1,548 @@ +# Work Log - 2026-03-27 + +- Added live value-log maintenance observability for `run_celestia` investigation: + - `TreeDB/caching/db.go` + - maintenance gate counters: + - `treedb.cache.vlog_generation.maintenance.attempts` + - `treedb.cache.vlog_generation.maintenance.acquired` + - `treedb.cache.vlog_generation.maintenance.collisions` + - `treedb.cache.vlog_generation.maintenance.skip.*` + - `treedb.cache.vlog_generation.maintenance.passes.{noop,with_rewrite,with_gc}` + - vacuum skip counters: + - `treedb.cache.vlog_generation.vacuum.skipped_disabled` + - `treedb.cache.vlog_generation.vacuum.skipped_rewrite_bytes` + - `treedb.cache.vlog_generation.vacuum.skipped_cooldown` + - Added/updated tests in `TreeDB/caching/vlog_generation_scheduler_test.go` for: + - WAL-on periodic skip accounting + - maintenance collision accounting + - vacuum skip-reason accounting + +- Validation: + - `go test ./TreeDB/caching -count=1` + +- `run_celestia` validation run (fast profile, local gomap override): + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast STOP_AT_LOCAL_HEIGHT=500 FREEZE_REMOTE_HEIGHT_AT_START=1 NO_PROGRESS_WARN_SECONDS=120 
NO_PROGRESS_FAIL_SECONDS=1800 HEAP_CAPTURE_RSS_DELTA_KB=1 CAPTURE_HEAP_ON_MAX_RSS=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327173138` + - duration / stop condition: + - `duration_seconds=277` + - local-height stop target hit at `local=10413004` + - disk: + - `end_app_bytes=5011158649` + - `disk-breakdown.log` shows dominant `maindb/wal/value-l0-*` files (~256MiB each) + +- Latest diagnostics snapshot used for maintenance counters: + - file: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327173138/sync/diagnostics/pprof-heap-max-rss-7070580k-20260327173541.treedb_vars.json` + - key counters: + - `maintenance.attempts=637` + - `maintenance.acquired=56` + - `maintenance.collisions=581` + - `maintenance.passes.noop=53` + - `maintenance.passes.with_gc=2` + - `maintenance.passes.with_rewrite=0` + - `maintenance.skip.quiet_window=26` + - `gc.runs=2` + - `rewrite.runs=0` + - `vacuum.runs=0` + - `scheduler_last_reason=periodic_gc` + +- Interpretation (for next slice): + - During this early state-sync window, rewrite did not trigger and therefore vacuum never became eligible on the post-rewrite path. + - The dominant scheduler behavior was active-pass contention (`collisions=581`) and noop acquired passes; this is now directly measurable. + +- Collision-coalescing follow-up (same day): + - code changes: + - `TreeDB/caching/db.go` + - `runVlogGenerationMaintenanceRetries`: when retry intent is already pending and `maintenanceActive` is true, wait/backoff instead of re-entering `maybeRun...` and creating repeated collisions. + - `maybeRunPeriodicVlogGenerationMaintenance`: skip periodic entry while `maintenanceActive` is true. + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - added `TestRunVlogGenerationMaintenanceRetries_CoalescesPendingCollisionRetries`. 
+ - validation: + - `go test ./TreeDB/caching -count=1` + +- `run_celestia` comparison rerun after collision-coalescing: + - command (same as prior comparison run): + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast STOP_AT_LOCAL_HEIGHT=500 FREEZE_REMOTE_HEIGHT_AT_START=1 NO_PROGRESS_WARN_SECONDS=120 NO_PROGRESS_FAIL_SECONDS=1800 HEAP_CAPTURE_RSS_DELTA_KB=1 CAPTURE_HEAP_ON_MAX_RSS=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327174335` + - sampled snapshot: + - `pprof-heap-max-rss-6605092k-20260327174744.treedb_vars.json` + +- Baseline vs new counter delta (same metric keys/sampling style): + - baseline snapshot: `pprof-heap-max-rss-7070580k-20260327173541.treedb_vars.json` + - `maintenance.attempts`: `637 -> 124` + - `maintenance.acquired`: `56 -> 119` + - `maintenance.collisions`: `581 -> 5` + - `maintenance.passes.noop`: `53 -> 116` + - `maintenance.passes.with_rewrite`: `0 -> 1` + - `maintenance.passes.with_gc`: `2 -> 1` + - `rewrite.runs`: `0 -> 1` + - `vacuum.runs`: `0 -> 1` + +- Interpretation: + - Coalescing retry loops materially reduced collision churn and allowed at least one rewrite+vacuum pass to complete in the same early-state-sync lab window. + +- Periodic preflight follow-up: + - code changes: + - `TreeDB/caching/db.go` + - `maybeRunPeriodicVlogGenerationMaintenance`: added hot-foreground preflight (when not `runGC`) to skip entering maintenance unless deferred/checkpoint wake is pending. + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - added `TestVlogGenerationMaintenance_PeriodicPreflightSkipsHotNoPending`. 
+ - validation: + - `go test ./TreeDB/caching -count=1` + +- Third `run_celestia` comparison run (same command profile): + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327175340` + - sampled snapshot: + - `pprof-heap-max-rss-6560568k-20260327175747.treedb_vars.json` + - key counters: + - `maintenance.attempts=46` + - `maintenance.acquired=41` + - `maintenance.collisions=5` + - `maintenance.passes.noop=38` + - `maintenance.passes.with_rewrite=1` + - `maintenance.passes.with_gc=1` + - `maintenance.skip.quiet_window=0` + - `rewrite.runs=1` + - `vacuum.runs=1` + +- Multi-run trend (same lab recipe): + - baseline snapshot: `attempts=637`, `collisions=581`, `rewrite=0`, `vacuum=0` + - collision-coalesced snapshot: `attempts=124`, `collisions=5`, `rewrite=1`, `vacuum=1` + - preflight snapshot: `attempts=46`, `collisions=5`, `rewrite=1`, `vacuum=1` + +- Interpretation: + - Retry coalescing delivered the major contention reduction. + - Periodic preflight further reduced maintenance churn/noop entries while preserving rewrite+vacuum progress in this early-state-sync window. 
+ +- Instrumentation-first follow-up for incremental rewrite economics: + - code changes: + - `TreeDB/caching/db.go` + - added maintenance/rewrite/gc/vacuum duration counters and stats: + - `treedb.cache.vlog_generation.maintenance.pass.{total,max,avg}_ms` + - `treedb.cache.vlog_generation.rewrite.plan.{total,max,avg}_ms` + - `treedb.cache.vlog_generation.rewrite.exec.{total,max,avg}_ms` + - `treedb.cache.vlog_generation.gc.exec.{total,max,avg}_ms` + - `treedb.cache.vlog_generation.vacuum.exec.{total,max,avg}_ms` + - added rewrite backlog/debt visibility stats: + - `treedb.cache.vlog_generation.rewrite.ledger_segments` + - `treedb.cache.vlog_generation.rewrite.ledger_bytes_{total,live,stale}` + - `treedb.cache.vlog_generation.rewrite.ledger_stale_ratio_ppm` + - `treedb.cache.vlog_generation.rewrite.stage_{pending,observed_unix_nano}` + - `treedb.cache.vlog_generation.rewrite.penalties_active` + - `treedb.cache.vlog_generation.rewrite.age_blocked_{until_unix_nano,remaining_ms}` + - added rewrite budget execution stats: + - `treedb.cache.vlog_generation.rewrite_budget.tokens_{bytes,cap_bytes}` + - `treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct` + - `treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total` + - tracked rewrite-budget token consumption inside `vlogGenerationConsumeRewriteBudgetBytes`. + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - added `TestVlogGenerationStats_ReportRewriteBacklogAndDurations`. 
+ +- Validation: + - `go test ./TreeDB/caching -run TestVlogGenerationStats_ReportRewriteBacklogAndDurations -count=1` + - `go test ./TreeDB/caching -count=1` + +- `run_celestia` instrumentation readout (application.db instance in expvar snapshots): + - run (`STOP_AT_LOCAL_HEIGHT=500`): + - home: `/home/mikers/.celestia-app-mainnet-treedb-20260327181906` + - key counters at peak snapshot: + - `maintenance.attempts=45`, `acquired=41`, `collisions=4` + - `rewrite.plan_runs=2`, `rewrite.plan_selected=2`, `rewrite.runs=1` + - `gc.runs=1`, `vacuum.runs=1` + - `rewrite_budget.consumed_bytes_total=33073153` + - offline compaction sanity check on that run: + - pre: `du -sb application.db = 4679915182` + - `treemap vlog-rewrite ... -rw` output: `segments_before=20 segments_after=15 bytes_before=4607146646 bytes_after=1983182186 records=957832` + - post: `du -sb application.db = 2021086813` + +- Longer-window stress run exposed retry-collision amplification: + - run (`STOP_AT_LOCAL_HEIGHT=2000`): + - home: `/home/mikers/.celestia-app-mainnet-treedb-20260327182903` + - timeline showed `acquired` flat while attempts/collisions spiked: + - snapshot progression reached `maintenance.attempts=333`, `collisions=304`, `acquired=29` + - `rewrite.plan_runs=3` but `rewrite.plan_selected=0`, `rewrite.runs=0` + - `checkpoint_kick.pending=true` persisted during collision growth + +- Fix for retry-collision amplification: + - code changes: + - `TreeDB/caching/db.go` + - `runVlogGenerationMaintenanceRetries`: when `maintenanceActive` is true, always back off/wait until release/deadline instead of conditionally attempting based on pending flags. + - This avoids high-frequency CAS collisions from checkpoint-kick retry goroutines while a long maintenance pass is active. + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - extended `TestRunVlogGenerationMaintenanceRetries_CoalescesPendingCollisionRetries` with checkpoint-pending=false + active pass case to prevent regression. 
+ - validation: + - `go test ./TreeDB/caching -run TestRunVlogGenerationMaintenanceRetries_CoalescesPendingCollisionRetries -count=1` + - `go test ./TreeDB/caching -count=1` + +- Confirmation run after fix: + - run (`STOP_AT_LOCAL_HEIGHT=2000`): + - home: `/home/mikers/.celestia-app-mainnet-treedb-20260327184030` + - comparable snapshot (`20260327184427`) vs pre-fix bad snapshot (`20260327183236`): + - `maintenance.attempts: 333 -> 38` + - `maintenance.acquired: 29 -> 38` + - `maintenance.collisions: 304 -> 0` + - `rewrite.plan_selected: 0 -> 2` + - `rewrite.runs: 0 -> 1` + - `vacuum.runs: 0 -> 1` + - `rewrite_budget.consumed_bytes_total: 0 -> 33073906` + +- Stage-gate/selection observability follow-up (live rewrite throughput diagnosis): + - code changes: + - `TreeDB/caching/db.go` + - added stage-gate split counters: + - `treedb.cache.vlog_generation.maintenance.skip.stage_gate_not_due` + - `treedb.cache.vlog_generation.maintenance.skip.stage_gate_due_reserved` + - added rewrite selection/execution segment counters: + - `treedb.cache.vlog_generation.rewrite.plan_selected_segments_total` + - `treedb.cache.vlog_generation.rewrite.exec.source_segments_total` + - incremented counters in: + - stage-gate early-return branches (`not_due` vs `due_reserved`) + - rewrite-plan outcome accounting (selected segments) + - rewrite execution completion (source segments executed) + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - extended `TestVlogGenerationStats_ReportRewriteBacklogAndDurations` assertions for new keys. 
+ +- Validation: + - `go test ./TreeDB/caching -count=1` + +- `run_celestia` run with new counters (baseline fast profile, no profile trigger override): + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast FREEZE_REMOTE_HEIGHT_AT_START=1 STOP_AT_LOCAL_HEIGHT=2000 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327192126` + - final snapshot: + - `pprof-heap-max-rss-final-7983364k-20260327192636.treedb_vars.json` + - key counters: + - `maintenance.attempts=38`, `acquired=38`, `collisions=0` + - `rewrite.plan_runs=2`, `rewrite.plan_selected=2` + - `rewrite.plan_selected_segments_total=3` + - `rewrite.runs=1`, `rewrite.exec.source_segments_total=1` + - `rewrite.bytes_in=33073442`, `rewrite.reclaimed_bytes=0` + - `maintenance.skip.stage_gate=7` + - `maintenance.skip.stage_gate_not_due=7` + - `maintenance.skip.stage_gate_due_reserved=0` + - interpretation: + - planner selected more segment debt than was executed in-run (`3 selected vs 1 executed`). + - stage gating was entirely waiting-for-confirmation (`not_due`), not due-slot reservation. + +- Offline rewrite delta for same run home: + - pre: `du -sb application.db = 4707839386` + - `treemap vlog-rewrite ... -rw` output: + - `segments_before=20 segments_after=16 bytes_before=4637168004 bytes_after=2039183405 records=964467` + - post: `du -sb application.db = 2077350273` + - interpretation: + - live run still leaves substantial reclaimable headroom; new counters indicate confirmation-gated debt progression as one concrete limiter. + +- Stage-confirm rewrite progression experiment (post-observability): + - hypothesis: + - live rewrite debt was bottlenecked by stage-confirm overlap collapse + single-segment execution, visible as `plan_selected_segments_total > rewrite.exec.source_segments_total`. 
+ - code changes: + - `TreeDB/caching/db.go` + - `vlogGenerationRewriteMaxSegmentsForRun`: + - keep checkpoint-kick (`bypassQuiet && !skipCheckpoint`) capped to single-segment, + - but allow stage-confirm / age-blocked deferred sources to use bounded debt-drain sizing. + - rewrite execution path when `haveRewritePlan`: + - allow debt-drain sizing for confirmed `rewrite_resume` plans instead of forcing single-segment. + - stale-ratio staged confirmation handling: + - once overlap confirms stability, execute current sparse plan rather than filtering to overlap-only subset. + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - added `TestVlogGenerationRewritePlan_StageConfirmationDebtDrainProcessesMultipleSegments`. + +- Validation: + - `go test ./TreeDB/caching -run 'TestVlogGenerationRewritePlan_StageConfirmationDebtDrainProcessesMultipleSegments|TestVlogGenerationRewriteQueue_CheckpointKickDebtDrainCapsSingleSegment|TestVlogGenerationRewriteQueue_DebtDrainProcessesMultipleSegments|TestVlogGenerationRewritePlan_StageConfirmationExecutesConfirmedSubset' -count=1` + - `go test ./TreeDB/caching -count=1` + - `go test ./TreeDB -count=1` + +- `run_celestia` comparison (same profile/height target): + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast FREEZE_REMOTE_HEIGHT_AT_START=1 STOP_AT_LOCAL_HEIGHT=5000 ~/run_celestia.sh` + - pre-change reference home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327194254` + - final snapshot: `pprof-heap-max-rss-final-6937148k-20260327194740.treedb_vars.json` + - key counters: + - `plan_selected_segments_total=3` + - `rewrite.exec.source_segments_total=1` + - `rewrite.runs=1` + - `rewrite.bytes_in=33081912` + - `rewrite.reclaimed_bytes=0` + - post-change run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327195053` + - final snapshot: `pprof-heap-max-rss-7548312k-20260327195551.treedb_vars.json` + - key counters: + - `plan_selected_segments_total=3` + - 
`rewrite.exec.source_segments_total=2` + - `rewrite.runs=1` + - `rewrite.bytes_in=38276046` + - `rewrite.reclaimed_bytes=0` + +- offline rewrite deltas (headroom remains large): + - pre-change home `/home/mikers/.celestia-app-mainnet-treedb-20260327194254`: + - pre: `4645103594` + - post: `2016029251` + - tool output: `segments_before=20 segments_after=15 bytes_before=4583082964 bytes_after=1978124746 records=956586` + - post-change home `/home/mikers/.celestia-app-mainnet-treedb-20260327195053`: + - pre: `4653743667` + - post: `2022437394` + - tool output: `segments_before=20 segments_after=15 bytes_before=4598014513 bytes_after=1984532899 records=958463` + +- interpretation: + - stage-confirm policy change increased in-run rewritten source segments (`1 -> 2`) in a comparable 5000-height window. + - immediate live reclaim remains `0`, and offline compaction still cuts ~2.6 GiB, so major headroom remains. + +- No-reclaim diagnostics instrumentation for live rewrite: + - code changes: + - `TreeDB/caching/db.go` + - added rewrite economics counters: + - `treedb.cache.vlog_generation.rewrite.processed_live_bytes` + - `treedb.cache.vlog_generation.rewrite.processed_stale_bytes` + - `treedb.cache.vlog_generation.rewrite.no_reclaim_runs` + - `treedb.cache.vlog_generation.rewrite.no_reclaim_stale_bytes` + - counters update in rewrite execution path: + - accumulate processed live/stale bytes from processed ledger chunk + - mark `no_reclaim_runs` when rewrite copied stale debt but global bytes did not fall in-pass + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - extended `TestVlogGenerationStats_ReportRewriteBacklogAndDurations` assertions for new keys. 
+ +- Validation: + - `go test ./TreeDB/caching -run 'TestVlogGenerationStats_ReportRewriteBacklogAndDurations|TestVlogGenerationRewritePlan_StageConfirmationDebtDrainProcessesMultipleSegments|TestVlogGenerationRewriteQueue_CheckpointKickDebtDrainCapsSingleSegment' -count=1` + - `go test ./TreeDB/caching -count=1` + - `go test ./TreeDB -count=1` + +- `run_celestia` readout with new counters: + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast FREEZE_REMOTE_HEIGHT_AT_START=1 STOP_AT_LOCAL_HEIGHT=5000 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327201801` + - final snapshot: + - `pprof-heap-max-rss-final-7767828k-20260327202312.treedb_vars.json` + - key counters: + - `plan_selected_segments_total=3` + - `rewrite.exec.source_segments_total=2` + - `rewrite.runs=1` + - `rewrite.bytes_in=38292854` + - `rewrite.processed_live_bytes=38292854` + - `rewrite.processed_stale_bytes=498581006` + - `rewrite.no_reclaim_runs=1` + - `rewrite.no_reclaim_stale_bytes=498581006` + - `rewrite.reclaimed_bytes=0` + - `gc.deleted_bytes=0` + +- interpretation: + - live rewrite now clearly reports that substantial stale payload was processed in-pass (~498 MiB) with zero immediate reclaim, confirming reclaim is blocked/deferred downstream of selection+copy. + +- offline rewrite sanity check for same run: + - pre: `4747763395` + - post: `2064528109` + - tool output: `segments_before=20 segments_after=16 bytes_before=4674175679 bytes_after=2026328485 records=963752` + +- GC blocker classification instrumentation (follow-up to no-reclaim counters): + - goal: + - make no-reclaim episodes diagnosable in one snapshot by showing whether bytes are blocked by active/pinned/protected classes vs actually eligible but pending delete. 
+ - code changes: + - `TreeDB/db/vlog_gc.go` + - extended `ValueLogGCStats` with: + - `SegmentsPending` + - `BytesPending` + - populated pending values after delete attempts as `eligible - deleted` when positive. + - `TreeDB/caching/db.go` + - added cached per-run GC classification fields to `DB` atomics. + - added `observeVlogGenerationGCStats(...)` and wired it into both: + - post-rewrite GC pass + - periodic GC pass + - exported new stats keys: + - `treedb.cache.vlog_generation.gc.last_referenced_segments/bytes` + - `treedb.cache.vlog_generation.gc.last_active_segments/bytes` + - `treedb.cache.vlog_generation.gc.last_protected_segments/bytes` + - `treedb.cache.vlog_generation.gc.last_eligible_segments/bytes` + - `treedb.cache.vlog_generation.gc.last_deleted_segments/bytes` + - `treedb.cache.vlog_generation.gc.last_pending_segments/bytes` + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - extended `TestVlogGenerationStats_ReportRewriteBacklogAndDurations` with assertions for all new keys. + +- Validation: + - `go test ./TreeDB/db ./TreeDB/caching -count=1` + +- Corrected `run_celestia` validation after adding `gc.last_*` stats: + - initial rerun with `STOP_AT_LOCAL_HEIGHT=5000` was invalid for maintenance analysis because the script treats it as an absolute local-height target; after state sync jump to ~10.4M, it exited immediately with no rewrite activity. + - corrected run command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327205149` + - note on diagnostics source: + - per-instance application counters were read from `*.debug_vars.json` at: + - `.treedb.instances[".../data/application.db/maindb/wal#..."]` + - `*.treedb_application_vars.json` was `{}` in this run, so instance readout is the reliable source. 
+ +- Final application-instance counters (`pprof-heap-max-rss-final-11027988k-20260327205709.debug_vars.json`): + - rewrite: + - `rewrite.plan_selected_segments_total=3` + - `rewrite.exec.source_segments_total=2` + - `rewrite.runs=1` + - `rewrite.bytes_in=38292854` + - `rewrite.processed_live_bytes=38292854` + - `rewrite.processed_stale_bytes=498582443` + - `rewrite.no_reclaim_runs=1` + - `rewrite.no_reclaim_stale_bytes=498582443` + - `rewrite.reclaimed_bytes=0` + - gc/classification: + - `gc.runs=1` + - `gc.deleted_bytes=0` + - `gc.last_referenced_segments=8` + - `gc.last_referenced_bytes=1294769679` + - `gc.last_active_segments=0` + - `gc.last_active_bytes=0` + - `gc.last_protected_segments=2` + - `gc.last_protected_bytes=536875297` + - `gc.last_eligible_segments=0` + - `gc.last_eligible_bytes=0` + - `gc.last_deleted_segments=0` + - `gc.last_deleted_bytes=0` + - `gc.last_pending_segments=0` + - `gc.last_pending_bytes=0` + - maintenance: + - `maintenance.attempts=35` + - `maintenance.acquired=35` + - `maintenance.collisions=0` + +- Interpretation: + - this run confirms stale bytes are being copied by live rewrite, but immediate reclaim is blocked because the final GC view reports `eligible=0` (not delete failure/pending). + - blocker class in this sample is dominated by `referenced + protected` bytes, not active segment pinning and not eligible-but-pending deletion. 
+ +- Offline reclaim headroom on the same run home: + - command: + - `/home/mikers/dev/snissn/celestia-app-p4/build/treemap-local vlog-rewrite /home/mikers/.celestia-app-mainnet-treedb-20260327205149/data/application.db -rw` + - pre: `5035136550` + - tool output: `segments_before=21 segments_after=16 bytes_before=4888181282 bytes_after=2076143228 records=985926` + - post: `2115096516` + +- GC protected-class split instrumentation (retained vs in-use vs overlap): + - motivation: + - prior `gc.last_protected_{segments,bytes}` proved protection was the blocker class but did not identify whether protection came from in-memory in-use paths vs retained-path lifecycle pins. + - code changes: + - `TreeDB/db/vlog_gc.go` + - `ValueLogGCOptions` extended with: + - `ProtectedInUsePaths []string` + - `ProtectedRetainedPaths []string` + - `ValueLogGCStats` extended with protected split buckets: + - `SegmentsProtectedInUse`, `BytesProtectedInUse` + - `SegmentsProtectedRetained`, `BytesProtectedRetained` + - `SegmentsProtectedOverlap`, `BytesProtectedOverlap` + - `SegmentsProtectedOther`, `BytesProtectedOther` + - GC classification now tags protected candidates by class while preserving `SegmentsProtected/BytesProtected` totals. + - protected-lane recent-window keep logic now uses the union of legacy + split protected path lists. + - `TreeDB/caching/db.go` + - added helper `valueLogGCProtectedPathSets()` and `valueLogGCOptions(dryRun bool)` to pass split path sets into backend GC. + - `observeVlogGenerationGCStats` now records split protected classes. 
+ - exported new stats keys: + - `treedb.cache.vlog_generation.gc.last_protected_in_use_{segments,bytes}` + - `treedb.cache.vlog_generation.gc.last_protected_retained_{segments,bytes}` + - `treedb.cache.vlog_generation.gc.last_protected_overlap_{segments,bytes}` + - `treedb.cache.vlog_generation.gc.last_protected_other_{segments,bytes}` + - tests: + - `TreeDB/db/vlog_gc_test.go` + - added `TestValueLogGC_ProtectedPathBreakdownStats` + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - extended `TestVlogGenerationStats_ReportRewriteBacklogAndDurations` assertions for new keys. + +- Validation: + - `go test ./TreeDB/db -run 'TestValueLogGC_ProtectedPathsDoNotKeepHistoricalRewriteLanes|TestValueLogGC_ProtectedPathBreakdownStats' -count=1` + - `go test ./TreeDB/caching -run 'TestVlogGenerationStats_ReportRewriteBacklogAndDurations' -count=1` + - `go test ./TreeDB/db ./TreeDB/caching -count=1` + +- Live confirmation run for protected split counters: + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327211238` + - final debug snapshot: + - `pprof-heap-max-rss-final-11733668k-20260327211812.debug_vars.json` + - application-instance counters: + - `rewrite.plan_selected_segments_total=3` + - `rewrite.exec.source_segments_total=2` + - `rewrite.runs=1` + - `rewrite.processed_stale_bytes=498580183` + - `rewrite.reclaimed_bytes=0` + - `gc.runs=1` + - `gc.deleted_bytes=0` + - `gc.last_protected_segments=2` + - `gc.last_protected_bytes=536873037` + - `gc.last_protected_in_use_segments=0` + - `gc.last_protected_in_use_bytes=0` + - `gc.last_protected_retained_segments=2` + - `gc.last_protected_retained_bytes=536873037` + - `gc.last_protected_overlap_segments=0` + - `gc.last_protected_overlap_bytes=0` + - `gc.last_protected_other_segments=0` + - `gc.last_protected_other_bytes=0` + - 
`gc.last_eligible_segments=0` + - `gc.last_eligible_bytes=0` + +- Interpretation update: + - for this run window, no-reclaim is attributable to retained-path protection (not in-use protection and not eligible/pending delete). + +- Offline headroom on same run home: + - command: + - `/home/mikers/dev/snissn/celestia-app-p4/build/treemap-local vlog-rewrite /home/mikers/.celestia-app-mainnet-treedb-20260327211238/data/application.db -rw` + - pre: `5266839216` + - tool output: `segments_before=22 segments_after=16 bytes_before=4993530542 bytes_after=2108841030 records=995454` + - post: `2148318606` + +## Retained-Prune Scheduling Instrumentation + Force Preemption (late update) + +- Code updates: + - `TreeDB/caching/db.go` + - retained-prune now exports explicit counters: + - `treedb.cache.vlog_retained_prune.closed_bytes` + - `treedb.cache.vlog_retained_prune.pressure_bytes` + - `treedb.cache.vlog_retained_prune.schedule_requests` + - `treedb.cache.vlog_retained_prune.schedule_forced_requests` + - `treedb.cache.vlog_retained_prune.schedule_skip.{closing,inflight,no_closed_bytes,below_pressure,min_interval}` + - `treedb.cache.vlog_retained_prune.force_pending` + - plus run/outcome counters (`runs`, `forced_runs`, `foreground_abort_runs`, `removed_*`). + - expvar now exports `treedb.cache.vlog_retained_prune.*` via allowlist. + - forced retained-prune requests can preempt a currently inflight quiet-window wait (instead of being starved by `schedule_skip.inflight`). + - retained-prune force trigger from GC-after-rewrite/periodic-GC is gated on `valueLogRetainedClosedBytes > 0`. + - `vlog_generation.gc.{runs,deleted_*}` accounting is updated in post-rewrite GC path as well. + - open-path retained closed-byte initialization includes existing retained segments found at startup. 
+ - `TreeDB/caching/db_test.go` + - added: + - `TestOpen_InitializesRetainedClosedBytesFromExistingSegments` + - `TestRetainedValueLogPruneForce_BypassesPressureThreshold` + - `TestRetainedValueLogPruneForce_PreemptsQuietWait` + - extended `TestCheckpoint_SkipsRetainedValueLogPruneBelowPressureThreshold` with schedule/closed-byte assertions. + - `TreeDB/caching/expvar_stats_test.go` + - selector test now verifies retained-prune family inclusion. + +- Validation: + - focused: + - `go test ./TreeDB/caching -run 'TestSelectTreeDBExpvarStatsFiltersAndCoerces|TestOpen_InitializesRetainedClosedBytesFromExistingSegments|TestRetainedValueLogPruneForce_BypassesPressureThreshold|TestRetainedValueLogPruneForce_PreemptsQuietWait|TestCheckpoint_SkipsRetainedValueLogPruneBelowPressureThreshold|TestCheckpoint_DoesNotWaitForPriorRetainedValueLogPrune|TestCheckpoint_SchedulesRetainedValueLogPruneAsynchronously|TestCheckpoint_DefersRetainedValueLogPruneUntilForegroundQuiet|TestRetainedValueLogPrune_AbortsWhenForegroundWritesResume|TestCheckpoint_RateLimitsRetainedValueLogPrune|TestBackendMaintenance_DoesNotBlockOnRetainedValueLogPruneQuietWindow|TestVlogGenerationStats_ReportRewriteBacklogAndDurations' -count=1` + - full: + - `go test ./TreeDB/caching ./TreeDB/db -count=1` + +- Live run readouts: + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260327222148` + - snapshot: `pprof-heap-max-rss-final-11238156k-20260327222731.debug_vars.json` + - application instance: + - `rewrite.runs=1`, `rewrite.processed_stale_bytes=498581053`, `rewrite.reclaimed_bytes=0` + - `gc.runs=2`, `gc.last_protected_retained_bytes=536873907`, `gc.last_eligible_bytes=0` + - `retained_prune.closed_bytes=5100295854` + - `retained_prune.pressure_bytes=17179869164` + - `retained_prune.schedule_requests=1551` + - `retained_prune.schedule_forced_requests=1` + - `retained_prune.schedule_skip.inflight=1549` + - `retained_prune.runs=0` + - interpretation: + - before force-preemption fix, one inflight 
quiet-window worker starved the later forced request. + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260327223633` + - snapshot: `pprof-heap-max-rss-8938296k-20260327224120.debug_vars.json` + - application instance: + - `rewrite.runs=0` (forced path not exercised in this short window) + - `retained_prune.closed_bytes=4563428411` + - `retained_prune.schedule_requests=185` + - `retained_prune.schedule_forced_requests=0` + - `retained_prune.schedule_skip.inflight=183` + - `retained_prune.runs=0` diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md new file mode 100644 index 000000000..5db2ed86a --- /dev/null +++ b/worklog/2026-03-28.md @@ -0,0 +1,330 @@ +# 2026-03-28 + +- Added a repeatable live-maintenance capacity analyzer: + - `scripts/analyze_vlog_maintenance_capacity.py` + - Input modes: + - latest run home auto-discovery (default) + - explicit run home dir + - explicit diagnostics JSON snapshot + - Prefers `*.debug_vars.json` snapshots and the `application.db` instance in multi-instance payloads. + - Emits derived signals for: + - maintenance lane pressure (attempt/acquire/collision + skip mix) + - rewrite plan select rate and selected->executed realization + - selected stale bytes vs processed stale bytes + - immediate reclaim ratio + - observed-source replay queue drain + - GC eligibility/protection summary + +- Updated runbook docs with command + usage: + - `docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md` + - Added `Live Run Capacity Report` section.
+ +- Validation run (existing Celestia home): + - command: + - `./scripts/analyze_vlog_maintenance_capacity.py /home/mikers/.celestia-app-mainnet-treedb-20260328050437` + - key outputs: + - `maintenance attempts/acquired/collisions = 74 / 74 / 0` + - `rewrite plan runs/selected/empty = 9 / 4 / 5` + - `selected->executed segments = 14 -> 7 (50.0%)` + - `selected stale -> processed stale = 2.91 GiB -> 1.46 GiB (50.0%)` + - `rewrite reclaimed bytes = 0 B` with `processed stale = 1.46 GiB` + - `observed-source queued/taken/pending ids = 29 / 29 / 0` + - interpretation: + - forced observed-source replay now drains cleanly, but the dominant remaining bottleneck is still zero immediate reclaim despite substantial stale rewrite processing. + +- Live run: higher rewrite budget pass (same fast profile) + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=536870912 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328052003` + - analyzer readout: + - `./scripts/analyze_vlog_maintenance_capacity.py /home/mikers/.celestia-app-mainnet-treedb-20260328052003` + - delta vs prior run (`/home/mikers/.celestia-app-mainnet-treedb-20260328050437`): + - `rewrite.exec.source_segments_total`: `7 -> 8` + - `rewrite.segment_realization_pct`: `50.0% -> 61.5%` + - `rewrite.processed_stale_bytes`: `1.46 GiB -> 1.60 GiB` + - `rewrite.bytes_in`: `300.41 MiB -> 408.30 MiB` + - `rewrite exec throughput`: `9.72 MiB/s -> 13.22 MiB/s` + - `rewrite.reclaimed_bytes`: stayed `0 B` + - interpretation: + - Higher budget improves rewrite execution throughput and plan-to-exec realization, but does not solve the core in-run reclaim issue (GC eligibility still zero at final snapshot). 
+ +- Post-run offline rewrite on the higher-budget home: + - command: + - `/home/mikers/dev/snissn/celestia-app-p4/build/treemap-local vlog-rewrite /home/mikers/.celestia-app-mainnet-treedb-20260328052003/data/application.db -rw` + - output: + - `vlog-rewrite: segments_before=16 segments_after=17 bytes_before=3496705485 bytes_after=2168049697 records=1011649` + - post-rewrite size/gzip: + - `du -sb`: `2208117397` + - `tar|gzip|wc -c`: `1781585169` + +- Added observed-source GC cumulative totals to TreeDB stats + analyzer: + - new stats keys: + - `treedb.cache.vlog_generation.observed_gc.source_segments_total` + - `treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total` + - `treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_eligible_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total` + - analyzer now reports observed-source cumulative eligible/deleted percentages. 
+ +- Validation run with forced rewrite trigger to exercise observed-source path: + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=536870912 TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=1073741824 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328054206` + - analyzer highlights: + - rewrite: `plan_selected_segments_total=2`, `exec.source_segments_total=2`, `processed_stale_bytes=475.06 MiB` + - `rewrite.reclaimed_bytes=0` + - observed replay queue: `queued/taken/pending ids = 14 / 14 / 0` + - observed-source totals: + - segments: `total=12 eligible=0 deleted=0` + - bytes: `total=3.00 GiB eligible=0 B deleted=0 B protected_retained=3.00 GiB` + - retained-prune summary: + - `runs=2 forced=2 closed=4.75 GiB` + - `candidates=7 (1.75 GiB) removed=0` + - skips: `in_use=6`, `live=5 (1.25 GiB)`, `zombie_marked=2 (512 MiB)` + - interpretation: + - This confirms the bottleneck signature in-run is observed-source bytes remaining retained-protected (never becoming GC-eligible in the measured window), not queue drain failure. 
+ +- Post-run offline checks on same home: + - `vlog-gc -rw`: + - `segments total=22 referenced=22 eligible=0 deleted=0 bytes_total=4737495161 bytes_eligible=0 bytes_deleted=0` + - `vlog-rewrite -rw`: + - `segments_before=22 segments_after=17 bytes_before=4737495161 bytes_after=2199392731 records=1021293` + - post-rewrite size/gzip: + - `du -sb`: `2239722809` + - `tar|gzip|wc -c`: `1805021465` + +- Added retained-prune observed-source cumulative counters (not just last-run snapshot): + - `treedb.cache.vlog_retained_prune.observed_source.segments_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_candidate_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_candidate_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_removed_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_removed_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_in_use_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_in_use_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_live_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_live_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_parse_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_parse_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_zombie_marked_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_zombie_marked_total` + +- Added value-log zombie inventory stats from manager: + - `treedb.cache.vlog_zombie.segments` + - `treedb.cache.vlog_zombie.bytes` + - `treedb.cache.vlog_zombie.pinned_segments` + - `treedb.cache.vlog_zombie.pinned_bytes` + - `treedb.cache.vlog_zombie.unpinned_segments` + - `treedb.cache.vlog_zombie.unpinned_bytes` + - plus process-memory estimates for zombie bytes. 
+ +- Analyzer/report updates: + - Include rewrite source outcomes (`requested/still_referenced/unreferenced`). + - Include observed-source retained-prune cumulative outcomes. + - Include zombie inventory (pinned vs unpinned bytes). + - Add signal note when rewrite-selected segments become unreferenced and zombie-marked while GC delete counters remain zero. + +- Validation run (`fast`, forced trigger, new counters) + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=536870912 TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=134217728 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328063722` + - analyzer highlights: + - rewrite source outcomes: `requested_total=4 unreferenced_total=4 still_referenced_total=0` + - observed-source retained-prune totals: `seen=4 (1.00 GiB), candidate=4 (1.00 GiB), zombie_marked=4 (1.00 GiB), live_skipped=0` + - observed-source GC cumulative: `total=3.50 GiB, eligible=0, deleted=0, protected_retained=3.50 GiB` + - retained-prune global: `zombie_marked=4 (1.00 GiB)` + - interpretation: + - rewrite-selected source segments are becoming unreferenced and are then being zombie-marked in retained-prune; replay queue is draining. + - zero observed-source GC deleted bytes is not explained by queue starvation or live-skips on observed sources. 
+ +- Second validation run with zombie inventory keys active: + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328065017` + - analyzer highlights: + - rewrite source outcomes: `requested_total=4 unreferenced_total=4 still_referenced_total=0` + - observed-source retained-prune totals: `seen=4, candidate=4, zombie_marked=4, removed=0, live_skipped=0` + - zombie inventory at final snapshot: `total=0, pinned=0, unpinned=0` + - observed-source GC cumulative totals still show `eligible=0 deleted=0 protected_retained=2.75 GiB` + - interpretation: + - observed-source segments are zombie-marked and eventually not present as tracked zombies by run end, yet GC delete counters remain zero; this indicates reclaim is occurring outside the current GC deleted-byte accounting and that the larger disk gap is primarily about how much stale data live rewrite selected during the run. + +- Headroom check on the same run (`20260328065017`) via offline rewrite: + - pre: `du -sb maindb/wal = 3805802931` + - command: + - `/home/mikers/dev/snissn/celestia-app-p4/build/treemap-local vlog-rewrite /home/mikers/.celestia-app-mainnet-treedb-20260328065017/data/application.db -rw` + - output: + - `vlog-rewrite: segments_before=20 segments_after=16 bytes_before=3805798835 bytes_after=2068426925 records=983187` + - post: `du -sb maindb/wal = 2068431021` + - implication: + - ~1.74 GiB additional compaction headroom remains versus end-of-live-run size under this workload.
+ +- Stale-ratio trigger sweep (live run_celestia) to isolate rewrite-selection threshold impact: + - low stale ratio path (forces ~0.50 segment threshold): + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=536870912 TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=0 TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM=100000 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260328070649` + - analyzer highlights: + - `selected->executed segments = 13 -> 7` + - `processed_stale_bytes = 1.51 GiB` + - end WAL: `3093987987` + - offline rewrite on same home: `3093983891 -> 2128313686` (`du -sb` post `2128317782`) + - high stale ratio control (~0.85 threshold): + - command: + - `... TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=0 TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM=850000 ...` + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260328071500` + - analyzer highlights: + - `selected->executed segments = 4 -> 4` + - `processed_stale_bytes = 983.89 MiB` + - end WAL: `3944887635` + - interpretation: + - lower stale-threshold selection materially improves in-run compaction and closes offline headroom. + +- Code change: allow explicitly configured stale-ratio trigger to drive generic/total-bytes rewrite segment selection threshold. + - file: `TreeDB/caching/db.go` + - changed `vlogGenerationRewriteMinStaleRatioForGenericPass` so when `TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM` is set, generic planning uses that configured threshold (with existing stale-ratio floor behavior), instead of always enforcing the stricter generic constant. + - default behavior remains unchanged when stale-ratio trigger is unset. 
+ - tests updated in `TreeDB/caching/vlog_generation_scheduler_test.go`: + - generic pass uses configured trigger ratio when set + - queued debt under total-bytes reflects configured ratio + - default generic ratio remains unchanged when trigger ratio is unset + +- Validation run after code change with both triggers enabled: + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=536870912 TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=134217728 TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM=100000 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260328072830` + - analyzer highlights: + - `selected->executed segments = 8 -> 8` + - `processed_stale_bytes = 1.58 GiB` + - end WAL: `3320308275` (improved vs prior total-bytes-trigger baselines around `3.7-3.9 GiB`) + - offline rewrite on same home: `3320304179 -> 2132071399` (`du -sb` post `2132075495`) + - interpretation: + - the threshold change improves total-bytes-triggered live rewrite coverage while preserving trigger semantics. + +- Follow-up experiments (fresh-plan burst + WAL-off pre-checkpoint gate) + - Added fresh-plan debt-drain burst policy for planned rewrite queues: + - `vlogGenerationRewriteFreshPlanDebtDrainMinSegments=4` + - `vlogGenerationRewriteFreshPlanDebtDrainMaxSegments=4` + - path: `TreeDB/caching/db.go` (`vlogGenerationRewriteMaxSegmentsForFreshPlan`) + - tests: `TestVlogGenerationRewriteMaxSegmentsForFreshPlan_*` + +- Capacity analyzer output improvement: + - `scripts/analyze_vlog_maintenance_capacity.py` now prints `pre_checkpoint` and `priority` in the maintenance skip-pressure line. 
+ +- Root-cause check for no-rewrite outlier: + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260328082023` + - observed: + - analyzer: `rewrite runs=0`, end WAL `5522118526` + - skip counters (`debug_vars`): `maintenance.skip.before_first_checkpoint=11` + - interpretation: + - WAL-off pre-checkpoint gate can suppress all rewrite activity on some short runs. + +- Added experimental override for WAL-off pre-checkpoint rewrite: + - env: `TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1` + - default remains disabled. + - gate change in `TreeDB/caching/db.go` allows bypassing `maintenance.skip.before_first_checkpoint` when env is set. + - docs updated: `docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md`. + - tests added: + - `TestVlogGenerationMaintenance_WALOffPreCheckpointSkipsRewriteByDefault` + - `TestVlogGenerationMaintenance_WALOffPreCheckpointCanRunWithEnvOverride` + +- Validation runs + - Baseline-like run (no override), home `/home/mikers/.celestia-app-mainnet-treedb-20260328075104`: + - end WAL: `3438411416` + - analyzer: `rewrite runs=8`, `selected->executed=8->8`, `processed_stale=1.59 GiB` + - offline rewrite: `3438407320 -> 2171030759` (post `du -sb`: `2171034855`) + - No-rewrite outlier (no override), home `/home/mikers/.celestia-app-mainnet-treedb-20260328082023`: + - end WAL: `5522118526` + - analyzer: `rewrite runs=0`, `pre_checkpoint skip dominated` + - offline rewrite: `5522114430 -> 2205781521` (post `du -sb`: `2205785617`) + - Override run (`TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1`), home `/home/mikers/.celestia-app-mainnet-treedb-20260328083336`: + - end WAL: `3477220043` + - analyzer: `pre_checkpoint=0`, `rewrite runs=8`, `selected->executed=9->8`, `processed_stale=1.59 GiB` + - offline rewrite: `3477215947 -> 2238622807` (post `du -sb`: `2238626903`) + +- Takeaway: + - pre-checkpoint gating is a first-order driver of run-to-run variance in live rewrite coverage under WAL-off fast runs. 
+ - enabling the pre-checkpoint override avoids the catastrophic `rewrite runs=0` failure mode and restores expected live rewrite activity. + +- Additional live sweep (focus: robust lower end-of-run WAL under `fast` + pre-checkpoint rewrite): + - fixed env baseline: + - `TREEDB_OPEN_PROFILE=fast` + - `TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1` + - `TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=1073741824` + - `FREEZE_REMOTE_HEIGHT_AT_START=1` + - no total-bytes backstop (outlier repro): + - command: + - `... TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=0 TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM=100000 ...` + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260328094006` + - analyzer highlights: + - `rewrite runs=1` + - `selected->executed=3->2` + - `processed_stale=475.49 MiB` + - `skip stage_gate/stage_not_due=7/7` + - end WAL: `4274361669` + - offline rewrite: `4274357573 -> 2093567828` (post `du -sb`: `2093571924`, `gzip -1`: `1749325383`) + - add total-bytes backstop @ `128 MiB`, stale ratio `100k`: + - run homes: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328094856` + - `/home/mikers/.celestia-app-mainnet-treedb-20260328100118` (replicate) + - analyzer highlights: + - run1: `rewrite runs=7`, `selected->executed=7->7`, `processed_stale=1.44 GiB`, `end WAL=3362578071` + - run2: `rewrite runs=6`, `selected->executed=6->6`, `processed_stale=1.27 GiB`, `end WAL=3574791009` + - offline rewrite: + - run1: `3362573975 -> 2116702484` (post `du -sb`: `2116706580`, `gzip -1`: `1767440551`) + - run2: `3574786913 -> 2132053768` (post `du -sb`: `2132057864`, `gzip -1`: `1778930169`) + - total-bytes backstop @ `64 MiB`, stale ratio sweep: + - stale `100k` (`/home/mikers/.celestia-app-mainnet-treedb-20260328101255`): + - `rewrite runs=8`, `selected->executed=8->8`, `processed_stale=1.60 GiB`, `end WAL=3391412031` + - rewrite `3391407935 -> 2156175550` (post `du -sb`: `2156179646`, `gzip -1`: `1793519331`) + - stale `50k` 
(`/home/mikers/.celestia-app-mainnet-treedb-20260328102614`): + - `rewrite runs=7`, `selected->executed=7->7`, `processed_stale=1.40 GiB`, `end WAL=3569727005` + - rewrite `3569722909 -> 2175068990` (post `du -sb`: `2175073086`, `gzip -1`: `1806440477`) + - stale `10k` (`/home/mikers/.celestia-app-mainnet-treedb-20260328103947`): + - `rewrite runs=9`, `selected->executed=9->9`, `processed_stale=1.77 GiB`, `end WAL=3588198674` + - rewrite `3588194578 -> 2188079023` (post `du -sb`: `2188083119`, `gzip -1`: `1817157727`) + - interpretation: + - adding a nonzero `trigger_total_bytes` backstop prevents the catastrophic low-coverage outlier seen with stale-ratio-only triggering. + - in this window, pushing stale-ratio lower (`100k -> 50k -> 10k`) increases rewrite volume but does **not** improve end-of-run or post-rewrite bytes; it trends worse, consistent with extra rewrite churn without live reclaim. + - best observed point in this sweep: `trigger_total_bytes=128MiB`, `stale_ratio_ppm=100000` (lowest end WAL and best post-rewrite/gzip among these runs). + +- Capacity analyzer output improvement (follow-up): + - `scripts/analyze_vlog_maintenance_capacity.py` now prints: + - `plan-empty breakdown: no_selection / age_blocked` + - `plan penalty-filter: runs / segments / to_empty_runs` + - this helps distinguish threshold-limited empty plans (`no_selection`) from penalty/cooldown suppression. 
+ +- Observability extension for observed-source GC protection breakdown: + - Added cumulative stats counters in `TreeDB/caching/db.go`: + - `treedb.cache.vlog_generation.observed_gc.source_segments_protected_in_use_total` + - `treedb.cache.vlog_generation.observed_gc.source_segments_protected_retained_total` + - `treedb.cache.vlog_generation.observed_gc.source_segments_protected_overlap_total` + - `treedb.cache.vlog_generation.observed_gc.source_segments_protected_other_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_protected_in_use_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_protected_overlap_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_protected_other_total` + - Extended `scripts/analyze_vlog_maintenance_capacity.py` to report observed-source protection mix (segments + bytes + percentages). + - Updated stats test coverage: + - `TreeDB/caching/vlog_generation_scheduler_test.go` (`TestVlogGenerationStats_ReportRewriteBacklogAndDurations`). 
+ +- Validation run using new protection-mix counters (best current config): + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1 TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=1073741824 TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=134217728 TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM=100000 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328110211` + - analyzer highlights: + - `rewrite runs=8`, `selected->executed=9->8`, `processed_stale=1.60 GiB` + - `plan-empty breakdown: no_selection=6 age_blocked=5` + - observed-source protection mix: + - segments: `in_use=0 retained=23 overlap=0 other=0` + - bytes: `in_use=0 B retained=5.75 GiB overlap=0 B other=0 B` + - size: + - end WAL: `3639153423` + - offline rewrite: `3639149327 -> 2230505477` (post `du -sb`: `2230509573`, `gzip -1`: `1848452954`) + - interpretation: + - in this run, observed-source protection is entirely `retained` (not `in_use` or overlap), confirming retained-lifecycle protection as the dominant in-run reclaim blocker.