diff --git a/src/main.ts b/src/main.ts index 8e500e9..3690fbc 100644 --- a/src/main.ts +++ b/src/main.ts @@ -12,18 +12,13 @@ import { exec } from "child_process"; import * as stateHelper from "./state-helper"; import * as reporter from "./reporter"; -import { - execWithTimeout, - ExecTimeoutError, - BOLT_CHECK_MEMORY_MAX_BYTES, - BOLT_CHECK_MAX_FILE_BYTES, -} from "./exec-utils"; import { setupStickyDisk, startAndConfigureBuildkitd, getNumCPUs, pruneBuildkitCache, logDatabaseHashes, + BUILDKIT_DAEMON_ADDR, } from "./setup_builder"; import { installBuildKit, @@ -38,112 +33,6 @@ const DEFAULT_BUILDX_VERSION = "v0.23.0"; const mountPoint = "/var/lib/buildkit"; const execAsync = promisify(exec); -async function getDeviceFromMount(mountPath: string): Promise { - try { - const { stdout } = await execAsync(`findmnt -n -o SOURCE "${mountPath}"`); - const device = stdout.trim(); - if (device) { - // Log full mount info for debugging - try { - const { stdout: mountInfo } = await execAsync( - `findmnt -n -o SOURCE,FSTYPE,OPTIONS "${mountPath}"`, - ); - core.info(`Mount info for ${mountPath}: ${mountInfo.trim()}`); - } catch { - // Ignore if we can't get full mount info - } - return device; - } - } catch { - core.info(`findmnt failed for ${mountPath}, trying mount command`); - } - - try { - const { stdout } = await execAsync(`mount | grep " ${mountPath} "`); - const match = stdout.match(/^(\/dev\/\S+)/); - if (match) { - core.info(`Mount info for ${mountPath}: ${stdout.trim()}`); - return match[1]; - } - } catch { - core.info(`mount grep failed for ${mountPath}`); - } - - return null; -} - -const FLUSH_TIMEOUT_SECS = 10; -const TIMEOUT_EXIT_CODE = 124; - -async function flushBlockDevice(devicePath: string): Promise { - const deviceName = devicePath.replace("/dev/", ""); - if (!deviceName) { - core.warning(`Could not extract device name from ${devicePath}`); - return; - } - - const statPath = `/sys/block/${deviceName}/stat`; - - let beforeStats = ""; - try { - const { stdout } = await execAsync(`cat ${statPath}`); - beforeStats = stdout.trim(); - } catch { - core.warning(`Could not read block device stats before flush: ${statPath}`); - } - - const startTime = Date.now(); - try { - const { stdout, stderr } = await execAsync( - `timeout ${FLUSH_TIMEOUT_SECS} sudo blockdev --flushbufs ${devicePath}; echo "EXIT_CODE:$?"`, - ); - const duration = Date.now() - startTime; - - // Parse exit code from output - const exitCodeMatch = stdout.match(/EXIT_CODE:(\d+)/); - const exitCode = exitCodeMatch ? parseInt(exitCodeMatch[1], 10) : 0; - - if (exitCode === TIMEOUT_EXIT_CODE) { - core.warning( - `guest flush timed out for ${devicePath} after ${FLUSH_TIMEOUT_SECS}s`, - ); - return; - } - - if (exitCode !== 0) { - core.warning( - `guest flush failed for ${devicePath} after ${duration}ms: exit code ${exitCode}, stderr: ${stderr}`, - ); - return; - } - - // Log stderr as warning even on success, in case there's useful diagnostic info - if (stderr && stderr.trim()) { - core.warning(`guest flush stderr (exit 0): ${stderr.trim()}`); - } - - let afterStats = ""; - try { - const { stdout } = await execAsync(`cat ${statPath}`); - afterStats = stdout.trim(); - } catch { - core.warning( - `Could not read block device stats after flush: ${statPath}`, - ); - } - - core.info( - `guest flush duration: ${duration}ms, device: ${devicePath}, before_stats: ${beforeStats}, after_stats: ${afterStats}`, - ); - } catch (error) { - const duration = Date.now() - startTime; - const errorMsg = error instanceof Error ? error.message : String(error); - core.warning( - `guest flush failed for ${devicePath} after ${duration}ms: ${errorMsg}`, - ); - } -} - async function checkBoltDbIntegrity(skip = false): Promise { if (skip) { core.info( @@ -155,20 +44,14 @@ async function checkBoltDbIntegrity(skip = false): Promise { try { // Check if /var/lib/buildkit directory exists try { - await execWithTimeout( - "test -d /var/lib/buildkit", - 15_000, - "test buildkit dir exists", - ); + await execAsync("test -d /var/lib/buildkit"); core.debug( "Found /var/lib/buildkit directory, checking for database files", ); // Find all *.db files in /var/lib/buildkit - const { stdout: dbFiles } = await execWithTimeout( + const { stdout: dbFiles } = await execAsync( "find /var/lib/buildkit -name '*.db' 2>/dev/null || true", - 30_000, - "find db files", ); if (dbFiles.trim()) { @@ -183,14 +66,11 @@ async function checkBoltDbIntegrity(skip = false): Promise { try { // Get file size let sizeInfo = ""; - let sizeBytes = 0; try { - const { stdout: sizeOutput } = await execWithTimeout( + const { stdout: sizeOutput } = await execAsync( `stat -c%s "${dbFile}" 2>/dev/null || stat -f%z "${dbFile}"`, - 15_000, - `stat db file ${dbFile}`, ); - sizeBytes = parseInt(sizeOutput.trim(), 10); + const sizeBytes = parseInt(sizeOutput.trim(), 10); if (!isNaN(sizeBytes) && sizeBytes > 0) { const sizeMB = (sizeBytes / (1024 * 1024)).toFixed(2); sizeInfo = ` (${sizeMB} MB)`; @@ -201,26 +81,12 @@ async function checkBoltDbIntegrity(skip = false): Promise { ); } - // Skip integrity check for files that are too large for the memory-limited - // systemd scope. bbolt check mmaps the entire file, and with ~50-60 MB of - // Go runtime overhead the process will be OOM-killed for large files. - if (sizeBytes > BOLT_CHECK_MAX_FILE_BYTES) { - const sizeMB = (sizeBytes / (1024 * 1024)).toFixed(2); - core.info( - `${dbFile}: Skipping integrity check - file size ${sizeMB} MB exceeds limit (${BOLT_CHECK_MAX_FILE_BYTES / (1024 * 1024)} MB)`, - ); - continue; - } - core.info(`Running bolt check on ${dbFile}${sizeInfo}...`); const startTime = Date.now(); try { - const memoryMaxMB = BOLT_CHECK_MEMORY_MAX_BYTES / (1024 * 1024); - const { stdout: checkResult } = await execWithTimeout( - `sudo systemd-run --scope --quiet -p MemoryMax=${memoryMaxMB}M -p RuntimeMaxSec=6s bbolt check "${dbFile}" 2>&1`, - 30_000, - `bbolt check ${dbFile}`, + const { stdout: checkResult } = await execAsync( + `sudo systemd-run --scope --quiet -p MemoryMax=512M -p RuntimeMaxSec=6s bbolt check "${dbFile}" 2>&1`, ); const duration = Date.now() - startTime; const durationSeconds = (duration / 1000).toFixed(2); @@ -245,15 +111,10 @@ async function checkBoltDbIntegrity(skip = false): Promise { const exitCode = (checkError as { code?: number }).code; const errorMessage = (checkError as Error).message; - // ExecTimeoutError = Promise.race timeout (process stuck in D state, e.g. Ceph partition) - if (checkError instanceof ExecTimeoutError) { - core.warning( - `⚠ ${dbFile}: Integrity check hit hard timeout after ${durationSeconds}s (possible I/O stall) - skipping`, - ); - // Exit code 124 = timeout, 137 = SIGKILL (likely OOM), 143 = SIGTERM - } else if (exitCode === 124) { + // Exit code 124 = timeout, 137 = SIGKILL (likely OOM), 143 = SIGTERM + if (exitCode === 124) { core.warning( - `⚠ ${dbFile}: Integrity check timed out after ${durationSeconds}s - skipping`, + `⚠ ${dbFile}: Integrity check timed out after ${durationSeconds}s - skipping (not counted as failure)`, ); } else if ( exitCode === 137 || @@ -261,7 +122,7 @@ async function checkBoltDbIntegrity(skip = false): Promise { errorMessage.toLowerCase().includes("cannot allocate memory") ) { core.warning( - `⚠ ${dbFile}: Integrity check hit memory limit - skipping`, + `⚠ ${dbFile}: Integrity check hit memory limit - skipping (not counted as failure)`, ); } else { core.warning( @@ -286,12 +147,6 @@ async function checkBoltDbIntegrity(skip = false): Promise { return true; } } catch (error) { - if (error instanceof ExecTimeoutError) { - core.warning( - `Integrity check hit hard timeout during filesystem access (possible I/O stall) - skipping`, - ); - return true; - } core.info( `/var/lib/buildkit directory not found, skipping database checks ${(error as Error).message}`, ); @@ -312,23 +167,9 @@ export interface Inputs { "github-token": string; "skip-integrity-check": boolean; "driver-opts": string[]; - "max-parallelism": number | null; } async function getInputs(): Promise { - const maxParallelismInput = core.getInput("max-parallelism"); - let maxParallelism: number | null = null; - if (maxParallelismInput) { - const parsed = parseInt(maxParallelismInput, 10); - if (!isNaN(parsed) && parsed > 0) { - maxParallelism = parsed; - } else { - core.warning( - `Invalid max-parallelism value '${maxParallelismInput}', ignoring. Must be a positive integer.`, - ); - } - } - return { "buildx-version": core.getInput("buildx-version"), "buildkit-version": core.getInput("buildkit-version"), @@ -340,7 +181,6 @@ async function getInputs(): Promise { ignoreComma: true, quote: false, }), - "max-parallelism": maxParallelism, }; } @@ -455,14 +295,8 @@ async function startBlacksmithBuilder( } } - // Get CPU count for parallelism, allow user override via max-parallelism input - let parallelism = await getNumCPUs(); - if (inputs["max-parallelism"] !== null) { - core.info( - `Overriding max-parallelism from ${parallelism} (nproc) to ${inputs["max-parallelism"]} (user-specified)`, - ); - parallelism = inputs["max-parallelism"]; - } + // Get CPU count for parallelism + const parallelism = await getNumCPUs(); // Check if buildkitd is already running before starting try { @@ -688,9 +522,43 @@ void actionsToolkit.run( // Optional: Prune cache before shutdown (non-critical) try { + // Capture cache state BEFORE prune for diagnostics + try { + core.info("=== BuildKit cache BEFORE prune ==="); + const { stdout: duBeforeVerbose } = await execAsync( + `sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du --verbose 2>&1 | tail -200`, + ); + core.info(duBeforeVerbose); + const { stdout: duBeforeSummary } = await execAsync( + `sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du 2>&1 | tail -5`, + ); + core.info(`Cache summary before prune: ${duBeforeSummary}`); + } catch (e) { + core.warning( + `Could not get pre-prune du: ${(e as Error).message}`, + ); + } + core.info("Pruning BuildKit cache"); await pruneBuildkitCache(); core.info("BuildKit cache pruned"); + + // Capture cache state AFTER prune for diagnostics + try { + core.info("=== BuildKit cache AFTER prune ==="); + const { stdout: duAfterVerbose } = await execAsync( + `sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du --verbose 2>&1 | tail -200`, + ); + core.info(duAfterVerbose); + const { stdout: duAfterSummary } = await execAsync( + `sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du 2>&1 | tail -5`, + ); + core.info(`Cache summary after prune: ${duAfterSummary}`); + } catch (e) { + core.warning( + `Could not get post-prune du: ${(e as Error).message}`, + ); + } } catch (error) { core.warning( `Error pruning BuildKit cache: ${(error as Error).message}`, @@ -771,19 +639,6 @@ void actionsToolkit.run( // Step 2: Sync and unmount sticky disk await execAsync("sync"); - // Get device path before unmount for durability flush - let devicePath: string | null = null; - try { - devicePath = await getDeviceFromMount(mountPoint); - if (devicePath) { - core.info( - `Found device ${devicePath} for mount point ${mountPoint}`, - ); - } - } catch { - core.info(`Could not determine device for ${mountPoint}`); - } - try { const { stdout: mountOutput } = await execAsync( `mount | grep "${mountPoint}"`, @@ -846,16 +701,6 @@ void actionsToolkit.run( await new Promise((resolve) => setTimeout(resolve, 100)); } } - - // Flush block device buffers after unmount to ensure data durability - // before the Ceph RBD snapshot is taken. The device is still mapped even though unmounted. - if (devicePath) { - await flushBlockDevice(devicePath); - } else { - core.info( - "Skipping durability flush: device path not found for mount point", - ); - } } else { core.debug("No sticky disk mount found"); } diff --git a/src/setup_builder.ts b/src/setup_builder.ts index 2f35e44..1b168e4 100644 --- a/src/setup_builder.ts +++ b/src/setup_builder.ts @@ -6,10 +6,9 @@ import * as TOML from "@iarna/toml"; import * as reporter from "./reporter"; import { execa } from "execa"; import * as stateHelper from "./state-helper"; -import { BOLT_CHECK_MAX_FILE_BYTES } from "./exec-utils"; // Constants for configuration. -const BUILDKIT_DAEMON_ADDR = "tcp://127.0.0.1:1234"; +export const BUILDKIT_DAEMON_ADDR = "tcp://127.0.0.1:1234"; const mountPoint = "/var/lib/buildkit"; const execAsync = promisify(exec); @@ -29,10 +28,8 @@ async function maybeFormatBlockDevice(device: string): Promise { // Run resize2fs to ensure filesystem uses full block device await execAsync(`sudo resize2fs -f ${device}`); core.debug(`Resized ext4 filesystem on ${device}`); - } catch (resizeError) { - core.warning( - `Error resizing ext4 filesystem on ${device}: ${(resizeError as Error).message}`, - ); + } catch { + core.warning(`Error resizing ext4 filesystem on ${device}`); } return device; } @@ -68,89 +65,15 @@ export async function getNumCPUs(): Promise { } } -/** - * Configures systemd-resolved to listen on all interfaces (not just loopback) - * so that BuildKit build containers on bridge networks can reach the DNS cache. - * - * By default, systemd-resolved only listens on 127.0.0.53, which is not - * reachable from containers in their own network namespace. This adds a - * drop-in config to make it listen on 0.0.0.0:53. - * - * See: https://github.com/moby/buildkit/issues/5009 - */ -async function configureSystemdResolvedForBuildkit(): Promise { - try { - await execAsync(`sudo mkdir -p /etc/systemd/resolved.conf.d`); - await execAsync( - `echo '[Resolve]\nDNSStubListenerExtra=0.0.0.0' | sudo tee /etc/systemd/resolved.conf.d/buildkit-dns.conf`, - ); - await execAsync(`sudo systemctl restart systemd-resolved`); - core.info( - "Configured systemd-resolved to listen on all interfaces for BuildKit DNS caching", - ); - } catch (error) { - core.warning( - `Failed to configure systemd-resolved: ${(error as Error).message}`, - ); - } -} - -/** - * Gets the host's primary routable IP address, which is reachable from - * BuildKit build containers on any network mode (host, bridge, custom). - * - * Falls back to public DNS servers if the routable IP cannot be determined. - */ -async function getRoutableHostDns(): Promise { - // Public DNS fallback in case we can't determine the host's routable IP - const publicDnsFallback = ["8.8.8.8", "8.8.4.4", "1.1.1.1", "1.0.0.1"]; - - try { - // Get the host's source IP for internet-bound traffic - const { stdout } = await execAsync( - `ip route get 1.1.1.1 | grep -oP 'src \\K[0-9.]+'`, - ); - const hostIp = stdout.trim(); - - if (hostIp && hostIp !== "127.0.0.53") { - core.info( - `Using host routable IP ${hostIp} as sole DNS nameserver for BuildKit (systemd-resolved cache)`, - ); - // Only use the host IP (backed by systemd-resolved cache). - // Do NOT include public DNS fallbacks — BuildKit round-robins across - // all nameservers rather than using them as ordered fallbacks, which - // would bypass the cache for ~50% of queries and defeat the purpose. - // systemd-resolved itself already has upstream fallback configured. - return [hostIp]; - } - } catch (error) { - core.warning( - `Failed to determine host routable IP: ${(error as Error).message}`, - ); - } - - core.info("Falling back to public DNS nameservers (no local cache)"); - return publicDnsFallback; -} - async function writeBuildkitdTomlFile( parallelism: number, addr: string, - dnsNameservers: string[], ): Promise { const jsonConfig: TOML.JsonMap = { root: "/var/lib/buildkit", grpc: { address: [addr], }, - // Point BuildKit at the host's systemd-resolved cache via a routable IP. - // This avoids the known issue where BuildKit falls back to hardcoded public DNS - // (8.8.8.8/8.8.4.4) because it can't use the 127.0.0.53 stub resolver from - // containers in separate network namespaces. - // See: https://github.com/moby/buildkit/issues/5009 - dns: { - nameservers: dnsNameservers, - }, registry: { "docker.io": { mirrors: ["http://192.168.127.1:5000"], @@ -195,11 +118,7 @@ export async function startBuildkitd( driverOpts?: string[], ): Promise { try { - // Configure systemd-resolved to listen on a routable address so BuildKit - // build containers can use the host's DNS cache from any network namespace. - await configureSystemdResolvedForBuildkit(); - const dnsNameservers = await getRoutableHostDns(); - await writeBuildkitdTomlFile(parallelism, addr, dnsNameservers); + await writeBuildkitdTomlFile(parallelism, addr); // Parse driver-opts to extract environment variables const envVars: Record = {}; @@ -479,24 +398,6 @@ export async function logDatabaseHashes(label: string): Promise { for (const filePath of dbFiles) { try { - // Check file size before attempting hash — skip large files that would - // timeout or consume excessive I/O. - try { - const { stdout: sizeOutput } = await execAsync( - `stat -c%s "${filePath}" 2>/dev/null || stat -f%z "${filePath}"`, - ); - const sizeBytes = parseInt(sizeOutput.trim(), 10); - if (!isNaN(sizeBytes) && sizeBytes > BOLT_CHECK_MAX_FILE_BYTES) { - const sizeMB = (sizeBytes / (1024 * 1024)).toFixed(2); - core.info( - ` ${filePath}: skipping hash (${sizeMB} MB exceeds ${BOLT_CHECK_MAX_FILE_BYTES / (1024 * 1024)} MB limit)`, - ); - continue; - } - } catch { - // If stat fails, still attempt the hash — md5sum will fail with a clear error - } - // Use timeout and md5sum to offload computation, avoiding reading file in Node.js const { stdout } = await execAsync( `timeout 5s sudo md5sum "${filePath}"`,