-
Notifications
You must be signed in to change notification settings - Fork 7
Add buildctl du diagnostics around prune #75
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,18 +12,13 @@ import { exec } from "child_process"; | |
|
|
||
| import * as stateHelper from "./state-helper"; | ||
| import * as reporter from "./reporter"; | ||
| import { | ||
| execWithTimeout, | ||
| ExecTimeoutError, | ||
| BOLT_CHECK_MEMORY_MAX_BYTES, | ||
| BOLT_CHECK_MAX_FILE_BYTES, | ||
| } from "./exec-utils"; | ||
| import { | ||
| setupStickyDisk, | ||
| startAndConfigureBuildkitd, | ||
| getNumCPUs, | ||
| pruneBuildkitCache, | ||
| logDatabaseHashes, | ||
| BUILDKIT_DAEMON_ADDR, | ||
| } from "./setup_builder"; | ||
| import { | ||
| installBuildKit, | ||
|
|
@@ -38,112 +33,6 @@ const DEFAULT_BUILDX_VERSION = "v0.23.0"; | |
| const mountPoint = "/var/lib/buildkit"; | ||
| const execAsync = promisify(exec); | ||
|
|
||
| async function getDeviceFromMount(mountPath: string): Promise<string | null> { | ||
| try { | ||
| const { stdout } = await execAsync(`findmnt -n -o SOURCE "${mountPath}"`); | ||
| const device = stdout.trim(); | ||
| if (device) { | ||
| // Log full mount info for debugging | ||
| try { | ||
| const { stdout: mountInfo } = await execAsync( | ||
| `findmnt -n -o SOURCE,FSTYPE,OPTIONS "${mountPath}"`, | ||
| ); | ||
| core.info(`Mount info for ${mountPath}: ${mountInfo.trim()}`); | ||
| } catch { | ||
| // Ignore if we can't get full mount info | ||
| } | ||
| return device; | ||
| } | ||
| } catch { | ||
| core.info(`findmnt failed for ${mountPath}, trying mount command`); | ||
| } | ||
|
|
||
| try { | ||
| const { stdout } = await execAsync(`mount | grep " ${mountPath} "`); | ||
| const match = stdout.match(/^(\/dev\/\S+)/); | ||
| if (match) { | ||
| core.info(`Mount info for ${mountPath}: ${stdout.trim()}`); | ||
| return match[1]; | ||
| } | ||
| } catch { | ||
| core.info(`mount grep failed for ${mountPath}`); | ||
| } | ||
|
|
||
| return null; | ||
| } | ||
|
|
||
| const FLUSH_TIMEOUT_SECS = 10; | ||
| const TIMEOUT_EXIT_CODE = 124; | ||
|
|
||
| async function flushBlockDevice(devicePath: string): Promise<void> { | ||
| const deviceName = devicePath.replace("/dev/", ""); | ||
| if (!deviceName) { | ||
| core.warning(`Could not extract device name from ${devicePath}`); | ||
| return; | ||
| } | ||
|
|
||
| const statPath = `/sys/block/${deviceName}/stat`; | ||
|
|
||
| let beforeStats = ""; | ||
| try { | ||
| const { stdout } = await execAsync(`cat ${statPath}`); | ||
| beforeStats = stdout.trim(); | ||
| } catch { | ||
| core.warning(`Could not read block device stats before flush: ${statPath}`); | ||
| } | ||
|
|
||
| const startTime = Date.now(); | ||
| try { | ||
| const { stdout, stderr } = await execAsync( | ||
| `timeout ${FLUSH_TIMEOUT_SECS} sudo blockdev --flushbufs ${devicePath}; echo "EXIT_CODE:$?"`, | ||
| ); | ||
| const duration = Date.now() - startTime; | ||
|
|
||
| // Parse exit code from output | ||
| const exitCodeMatch = stdout.match(/EXIT_CODE:(\d+)/); | ||
| const exitCode = exitCodeMatch ? parseInt(exitCodeMatch[1], 10) : 0; | ||
|
|
||
| if (exitCode === TIMEOUT_EXIT_CODE) { | ||
| core.warning( | ||
| `guest flush timed out for ${devicePath} after ${FLUSH_TIMEOUT_SECS}s`, | ||
| ); | ||
| return; | ||
| } | ||
|
|
||
| if (exitCode !== 0) { | ||
| core.warning( | ||
| `guest flush failed for ${devicePath} after ${duration}ms: exit code ${exitCode}, stderr: ${stderr}`, | ||
| ); | ||
| return; | ||
| } | ||
|
|
||
| // Log stderr as warning even on success, in case there's useful diagnostic info | ||
| if (stderr && stderr.trim()) { | ||
| core.warning(`guest flush stderr (exit 0): ${stderr.trim()}`); | ||
| } | ||
|
|
||
| let afterStats = ""; | ||
| try { | ||
| const { stdout } = await execAsync(`cat ${statPath}`); | ||
| afterStats = stdout.trim(); | ||
| } catch { | ||
| core.warning( | ||
| `Could not read block device stats after flush: ${statPath}`, | ||
| ); | ||
| } | ||
|
|
||
| core.info( | ||
| `guest flush duration: ${duration}ms, device: ${devicePath}, before_stats: ${beforeStats}, after_stats: ${afterStats}`, | ||
| ); | ||
| } catch (error) { | ||
| const duration = Date.now() - startTime; | ||
| const errorMsg = error instanceof Error ? error.message : String(error); | ||
| core.warning( | ||
| `guest flush failed for ${devicePath} after ${duration}ms: ${errorMsg}`, | ||
| ); | ||
| } | ||
| } | ||
|
|
||
| async function checkBoltDbIntegrity(skip = false): Promise<boolean> { | ||
| if (skip) { | ||
| core.info( | ||
|
|
@@ -155,20 +44,14 @@ async function checkBoltDbIntegrity(skip = false): Promise<boolean> { | |
| try { | ||
| // Check if /var/lib/buildkit directory exists | ||
| try { | ||
| await execWithTimeout( | ||
| "test -d /var/lib/buildkit", | ||
| 15_000, | ||
| "test buildkit dir exists", | ||
| ); | ||
| await execAsync("test -d /var/lib/buildkit"); | ||
| core.debug( | ||
| "Found /var/lib/buildkit directory, checking for database files", | ||
| ); | ||
|
|
||
| // Find all *.db files in /var/lib/buildkit | ||
| const { stdout: dbFiles } = await execWithTimeout( | ||
| const { stdout: dbFiles } = await execAsync( | ||
| "find /var/lib/buildkit -name '*.db' 2>/dev/null || true", | ||
| 30_000, | ||
| "find db files", | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Documented max-parallelism input no longer appliedMedium Severity The public Additional Locations (1) |
||
| ); | ||
|
|
||
| if (dbFiles.trim()) { | ||
|
|
@@ -183,14 +66,11 @@ async function checkBoltDbIntegrity(skip = false): Promise<boolean> { | |
| try { | ||
| // Get file size | ||
| let sizeInfo = ""; | ||
| let sizeBytes = 0; | ||
| try { | ||
| const { stdout: sizeOutput } = await execWithTimeout( | ||
| const { stdout: sizeOutput } = await execAsync( | ||
| `stat -c%s "${dbFile}" 2>/dev/null || stat -f%z "${dbFile}"`, | ||
| 15_000, | ||
| `stat db file ${dbFile}`, | ||
| ); | ||
| sizeBytes = parseInt(sizeOutput.trim(), 10); | ||
| const sizeBytes = parseInt(sizeOutput.trim(), 10); | ||
| if (!isNaN(sizeBytes) && sizeBytes > 0) { | ||
| const sizeMB = (sizeBytes / (1024 * 1024)).toFixed(2); | ||
| sizeInfo = ` (${sizeMB} MB)`; | ||
|
|
@@ -201,26 +81,12 @@ async function checkBoltDbIntegrity(skip = false): Promise<boolean> { | |
| ); | ||
| } | ||
|
|
||
| // Skip integrity check for files that are too large for the memory-limited | ||
| // systemd scope. bbolt check mmaps the entire file, and with ~50-60 MB of | ||
| // Go runtime overhead the process will be OOM-killed for large files. | ||
| if (sizeBytes > BOLT_CHECK_MAX_FILE_BYTES) { | ||
| const sizeMB = (sizeBytes / (1024 * 1024)).toFixed(2); | ||
| core.info( | ||
| `${dbFile}: Skipping integrity check - file size ${sizeMB} MB exceeds limit (${BOLT_CHECK_MAX_FILE_BYTES / (1024 * 1024)} MB)`, | ||
| ); | ||
| continue; | ||
| } | ||
|
|
||
| core.info(`Running bolt check on ${dbFile}${sizeInfo}...`); | ||
| const startTime = Date.now(); | ||
|
|
||
| try { | ||
| const memoryMaxMB = BOLT_CHECK_MEMORY_MAX_BYTES / (1024 * 1024); | ||
| const { stdout: checkResult } = await execWithTimeout( | ||
| `sudo systemd-run --scope --quiet -p MemoryMax=${memoryMaxMB}M -p RuntimeMaxSec=6s bbolt check "${dbFile}" 2>&1`, | ||
| 30_000, | ||
| `bbolt check ${dbFile}`, | ||
| const { stdout: checkResult } = await execAsync( | ||
| `sudo systemd-run --scope --quiet -p MemoryMax=512M -p RuntimeMaxSec=6s bbolt check "${dbFile}" 2>&1`, | ||
| ); | ||
| const duration = Date.now() - startTime; | ||
| const durationSeconds = (duration / 1000).toFixed(2); | ||
|
|
@@ -245,23 +111,18 @@ async function checkBoltDbIntegrity(skip = false): Promise<boolean> { | |
| const exitCode = (checkError as { code?: number }).code; | ||
| const errorMessage = (checkError as Error).message; | ||
|
|
||
| // ExecTimeoutError = Promise.race timeout (process stuck in D state, e.g. Ceph partition) | ||
| if (checkError instanceof ExecTimeoutError) { | ||
| core.warning( | ||
| `⚠ ${dbFile}: Integrity check hit hard timeout after ${durationSeconds}s (possible I/O stall) - skipping`, | ||
| ); | ||
| // Exit code 124 = timeout, 137 = SIGKILL (likely OOM), 143 = SIGTERM | ||
| } else if (exitCode === 124) { | ||
| // Exit code 124 = timeout, 137 = SIGKILL (likely OOM), 143 = SIGTERM | ||
| if (exitCode === 124) { | ||
| core.warning( | ||
| `⚠ ${dbFile}: Integrity check timed out after ${durationSeconds}s - skipping`, | ||
| `⚠ ${dbFile}: Integrity check timed out after ${durationSeconds}s - skipping (not counted as failure)`, | ||
| ); | ||
| } else if ( | ||
| exitCode === 137 || | ||
| errorMessage.toLowerCase().includes("out of memory") || | ||
| errorMessage.toLowerCase().includes("cannot allocate memory") | ||
| ) { | ||
| core.warning( | ||
| `⚠ ${dbFile}: Integrity check hit memory limit - skipping`, | ||
| `⚠ ${dbFile}: Integrity check hit memory limit - skipping (not counted as failure)`, | ||
| ); | ||
| } else { | ||
| core.warning( | ||
|
|
@@ -286,12 +147,6 @@ async function checkBoltDbIntegrity(skip = false): Promise<boolean> { | |
| return true; | ||
| } | ||
| } catch (error) { | ||
| if (error instanceof ExecTimeoutError) { | ||
| core.warning( | ||
| `Integrity check hit hard timeout during filesystem access (possible I/O stall) - skipping`, | ||
| ); | ||
| return true; | ||
| } | ||
| core.info( | ||
| `/var/lib/buildkit directory not found, skipping database checks ${(error as Error).message}`, | ||
| ); | ||
|
|
@@ -312,23 +167,9 @@ export interface Inputs { | |
| "github-token": string; | ||
| "skip-integrity-check": boolean; | ||
| "driver-opts": string[]; | ||
| "max-parallelism": number | null; | ||
| } | ||
|
|
||
| async function getInputs(): Promise<Inputs> { | ||
| const maxParallelismInput = core.getInput("max-parallelism"); | ||
| let maxParallelism: number | null = null; | ||
| if (maxParallelismInput) { | ||
| const parsed = parseInt(maxParallelismInput, 10); | ||
| if (!isNaN(parsed) && parsed > 0) { | ||
| maxParallelism = parsed; | ||
| } else { | ||
| core.warning( | ||
| `Invalid max-parallelism value '${maxParallelismInput}', ignoring. Must be a positive integer.`, | ||
| ); | ||
| } | ||
| } | ||
|
|
||
| return { | ||
| "buildx-version": core.getInput("buildx-version"), | ||
| "buildkit-version": core.getInput("buildkit-version"), | ||
|
|
@@ -340,7 +181,6 @@ async function getInputs(): Promise<Inputs> { | |
| ignoreComma: true, | ||
| quote: false, | ||
| }), | ||
| "max-parallelism": maxParallelism, | ||
| }; | ||
| } | ||
|
|
||
|
|
@@ -455,14 +295,8 @@ async function startBlacksmithBuilder( | |
| } | ||
| } | ||
|
|
||
| // Get CPU count for parallelism, allow user override via max-parallelism input | ||
| let parallelism = await getNumCPUs(); | ||
| if (inputs["max-parallelism"] !== null) { | ||
| core.info( | ||
| `Overriding max-parallelism from ${parallelism} (nproc) to ${inputs["max-parallelism"]} (user-specified)`, | ||
| ); | ||
| parallelism = inputs["max-parallelism"]; | ||
| } | ||
| // Get CPU count for parallelism | ||
| const parallelism = await getNumCPUs(); | ||
|
|
||
| // Check if buildkitd is already running before starting | ||
| try { | ||
|
|
@@ -688,9 +522,43 @@ void actionsToolkit.run( | |
|
|
||
| // Optional: Prune cache before shutdown (non-critical) | ||
| try { | ||
| // Capture cache state BEFORE prune for diagnostics | ||
| try { | ||
| core.info("=== BuildKit cache BEFORE prune ==="); | ||
| const { stdout: duBeforeVerbose } = await execAsync( | ||
| `sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du --verbose 2>&1 | tail -200`, | ||
| ); | ||
| core.info(duBeforeVerbose); | ||
| const { stdout: duBeforeSummary } = await execAsync( | ||
| `sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du 2>&1 | tail -5`, | ||
| ); | ||
| core.info(`Cache summary before prune: ${duBeforeSummary}`); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. New buildctl du diagnostics can hang cleanupMedium Severity The new pre/post-prune Additional Locations (1) |
||
| } catch (e) { | ||
| core.warning( | ||
| `Could not get pre-prune du: ${(e as Error).message}`, | ||
| ); | ||
| } | ||
|
|
||
| core.info("Pruning BuildKit cache"); | ||
| await pruneBuildkitCache(); | ||
| core.info("BuildKit cache pruned"); | ||
|
|
||
| // Capture cache state AFTER prune for diagnostics | ||
| try { | ||
| core.info("=== BuildKit cache AFTER prune ==="); | ||
| const { stdout: duAfterVerbose } = await execAsync( | ||
| `sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du --verbose 2>&1 | tail -200`, | ||
| ); | ||
| core.info(duAfterVerbose); | ||
| const { stdout: duAfterSummary } = await execAsync( | ||
| `sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du 2>&1 | tail -5`, | ||
| ); | ||
| core.info(`Cache summary after prune: ${duAfterSummary}`); | ||
| } catch (e) { | ||
| core.warning( | ||
| `Could not get post-prune du: ${(e as Error).message}`, | ||
| ); | ||
| } | ||
| } catch (error) { | ||
| core.warning( | ||
| `Error pruning BuildKit cache: ${(error as Error).message}`, | ||
|
|
@@ -771,19 +639,6 @@ void actionsToolkit.run( | |
| // Step 2: Sync and unmount sticky disk | ||
| await execAsync("sync"); | ||
|
|
||
| // Get device path before unmount for durability flush | ||
| let devicePath: string | null = null; | ||
| try { | ||
| devicePath = await getDeviceFromMount(mountPoint); | ||
| if (devicePath) { | ||
| core.info( | ||
| `Found device ${devicePath} for mount point ${mountPoint}`, | ||
| ); | ||
| } | ||
| } catch { | ||
| core.info(`Could not determine device for ${mountPoint}`); | ||
| } | ||
|
|
||
| try { | ||
| const { stdout: mountOutput } = await execAsync( | ||
| `mount | grep "${mountPoint}"`, | ||
|
|
@@ -846,16 +701,6 @@ void actionsToolkit.run( | |
| await new Promise((resolve) => setTimeout(resolve, 100)); | ||
| } | ||
| } | ||
|
|
||
| // Flush block device buffers after unmount to ensure data durability | ||
| // before the Ceph RBD snapshot is taken. The device is still mapped even though unmounted. | ||
| if (devicePath) { | ||
| await flushBlockDevice(devicePath); | ||
| } else { | ||
| core.info( | ||
| "Skipping durability flush: device path not found for mount point", | ||
| ); | ||
| } | ||
| } else { | ||
| core.debug("No sticky disk mount found"); | ||
| } | ||
|
|
||


There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Integrity checks lost hard timeout protection
High Severity
checkBoltDbIntegritynow uses bareexecAsyncfor filesystem andbboltcommands, so stalled I/O can block forever. The previousexecWithTimeout/ExecTimeoutErrorpath that allowed skipping on hangs was removed, which can freeze post-action cleanup.Additional Locations (1)
src/main.ts#L149-L153