Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
249 changes: 47 additions & 202 deletions src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,13 @@ import { exec } from "child_process";

import * as stateHelper from "./state-helper";
import * as reporter from "./reporter";
import {
execWithTimeout,
ExecTimeoutError,
BOLT_CHECK_MEMORY_MAX_BYTES,
BOLT_CHECK_MAX_FILE_BYTES,
} from "./exec-utils";
import {
setupStickyDisk,
startAndConfigureBuildkitd,
getNumCPUs,
pruneBuildkitCache,
logDatabaseHashes,
BUILDKIT_DAEMON_ADDR,
} from "./setup_builder";
import {
installBuildKit,
Expand All @@ -38,112 +33,6 @@ const DEFAULT_BUILDX_VERSION = "v0.23.0";
const mountPoint = "/var/lib/buildkit";
const execAsync = promisify(exec);

async function getDeviceFromMount(mountPath: string): Promise<string | null> {
try {
const { stdout } = await execAsync(`findmnt -n -o SOURCE "${mountPath}"`);
const device = stdout.trim();
if (device) {
// Log full mount info for debugging
try {
const { stdout: mountInfo } = await execAsync(
`findmnt -n -o SOURCE,FSTYPE,OPTIONS "${mountPath}"`,
);
core.info(`Mount info for ${mountPath}: ${mountInfo.trim()}`);
} catch {
// Ignore if we can't get full mount info
}
return device;
}
} catch {
core.info(`findmnt failed for ${mountPath}, trying mount command`);
}

try {
const { stdout } = await execAsync(`mount | grep " ${mountPath} "`);
const match = stdout.match(/^(\/dev\/\S+)/);
if (match) {
core.info(`Mount info for ${mountPath}: ${stdout.trim()}`);
return match[1];
}
} catch {
core.info(`mount grep failed for ${mountPath}`);
}

return null;
}

const FLUSH_TIMEOUT_SECS = 10;
const TIMEOUT_EXIT_CODE = 124;

async function flushBlockDevice(devicePath: string): Promise<void> {
const deviceName = devicePath.replace("/dev/", "");
if (!deviceName) {
core.warning(`Could not extract device name from ${devicePath}`);
return;
}

const statPath = `/sys/block/${deviceName}/stat`;

let beforeStats = "";
try {
const { stdout } = await execAsync(`cat ${statPath}`);
beforeStats = stdout.trim();
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Integrity checks lost hard timeout protection

High Severity

checkBoltDbIntegrity now uses bare execAsync for filesystem and bbolt commands, so stalled I/O can block forever. The previous execWithTimeout/ExecTimeoutError path that allowed skipping on hangs was removed, which can freeze post-action cleanup.

Additional Locations (1)
Fix in Cursor Fix in Web

} catch {
core.warning(`Could not read block device stats before flush: ${statPath}`);
}

const startTime = Date.now();
try {
const { stdout, stderr } = await execAsync(
`timeout ${FLUSH_TIMEOUT_SECS} sudo blockdev --flushbufs ${devicePath}; echo "EXIT_CODE:$?"`,
);
const duration = Date.now() - startTime;

// Parse exit code from output
const exitCodeMatch = stdout.match(/EXIT_CODE:(\d+)/);
const exitCode = exitCodeMatch ? parseInt(exitCodeMatch[1], 10) : 0;

if (exitCode === TIMEOUT_EXIT_CODE) {
core.warning(
`guest flush timed out for ${devicePath} after ${FLUSH_TIMEOUT_SECS}s`,
);
return;
}

if (exitCode !== 0) {
core.warning(
`guest flush failed for ${devicePath} after ${duration}ms: exit code ${exitCode}, stderr: ${stderr}`,
);
return;
}

// Log stderr as warning even on success, in case there's useful diagnostic info
if (stderr && stderr.trim()) {
core.warning(`guest flush stderr (exit 0): ${stderr.trim()}`);
}

let afterStats = "";
try {
const { stdout } = await execAsync(`cat ${statPath}`);
afterStats = stdout.trim();
} catch {
core.warning(
`Could not read block device stats after flush: ${statPath}`,
);
}

core.info(
`guest flush duration: ${duration}ms, device: ${devicePath}, before_stats: ${beforeStats}, after_stats: ${afterStats}`,
);
} catch (error) {
const duration = Date.now() - startTime;
const errorMsg = error instanceof Error ? error.message : String(error);
core.warning(
`guest flush failed for ${devicePath} after ${duration}ms: ${errorMsg}`,
);
}
}

async function checkBoltDbIntegrity(skip = false): Promise<boolean> {
if (skip) {
core.info(
Expand All @@ -155,20 +44,14 @@ async function checkBoltDbIntegrity(skip = false): Promise<boolean> {
try {
// Check if /var/lib/buildkit directory exists
try {
await execWithTimeout(
"test -d /var/lib/buildkit",
15_000,
"test buildkit dir exists",
);
await execAsync("test -d /var/lib/buildkit");
core.debug(
"Found /var/lib/buildkit directory, checking for database files",
);

// Find all *.db files in /var/lib/buildkit
const { stdout: dbFiles } = await execWithTimeout(
const { stdout: dbFiles } = await execAsync(
"find /var/lib/buildkit -name '*.db' 2>/dev/null || true",
30_000,
"find db files",
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Documented max-parallelism input no longer applied

Medium Severity

The public max-parallelism input is still declared in action.yml, but parsing and override logic were removed. User-provided values are now silently ignored and parallelism is always derived from CPU count.

Additional Locations (1)
Fix in Cursor Fix in Web

);

if (dbFiles.trim()) {
Expand All @@ -183,14 +66,11 @@ async function checkBoltDbIntegrity(skip = false): Promise<boolean> {
try {
// Get file size
let sizeInfo = "";
let sizeBytes = 0;
try {
const { stdout: sizeOutput } = await execWithTimeout(
const { stdout: sizeOutput } = await execAsync(
`stat -c%s "${dbFile}" 2>/dev/null || stat -f%z "${dbFile}"`,
15_000,
`stat db file ${dbFile}`,
);
sizeBytes = parseInt(sizeOutput.trim(), 10);
const sizeBytes = parseInt(sizeOutput.trim(), 10);
if (!isNaN(sizeBytes) && sizeBytes > 0) {
const sizeMB = (sizeBytes / (1024 * 1024)).toFixed(2);
sizeInfo = ` (${sizeMB} MB)`;
Expand All @@ -201,26 +81,12 @@ async function checkBoltDbIntegrity(skip = false): Promise<boolean> {
);
}

// Skip integrity check for files that are too large for the memory-limited
// systemd scope. bbolt check mmaps the entire file, and with ~50-60 MB of
// Go runtime overhead the process will be OOM-killed for large files.
if (sizeBytes > BOLT_CHECK_MAX_FILE_BYTES) {
const sizeMB = (sizeBytes / (1024 * 1024)).toFixed(2);
core.info(
`${dbFile}: Skipping integrity check - file size ${sizeMB} MB exceeds limit (${BOLT_CHECK_MAX_FILE_BYTES / (1024 * 1024)} MB)`,
);
continue;
}

core.info(`Running bolt check on ${dbFile}${sizeInfo}...`);
const startTime = Date.now();

try {
const memoryMaxMB = BOLT_CHECK_MEMORY_MAX_BYTES / (1024 * 1024);
const { stdout: checkResult } = await execWithTimeout(
`sudo systemd-run --scope --quiet -p MemoryMax=${memoryMaxMB}M -p RuntimeMaxSec=6s bbolt check "${dbFile}" 2>&1`,
30_000,
`bbolt check ${dbFile}`,
const { stdout: checkResult } = await execAsync(
`sudo systemd-run --scope --quiet -p MemoryMax=512M -p RuntimeMaxSec=6s bbolt check "${dbFile}" 2>&1`,
);
const duration = Date.now() - startTime;
const durationSeconds = (duration / 1000).toFixed(2);
Expand All @@ -245,23 +111,18 @@ async function checkBoltDbIntegrity(skip = false): Promise<boolean> {
const exitCode = (checkError as { code?: number }).code;
const errorMessage = (checkError as Error).message;

// ExecTimeoutError = Promise.race timeout (process stuck in D state, e.g. Ceph partition)
if (checkError instanceof ExecTimeoutError) {
core.warning(
`⚠ ${dbFile}: Integrity check hit hard timeout after ${durationSeconds}s (possible I/O stall) - skipping`,
);
// Exit code 124 = timeout, 137 = SIGKILL (likely OOM), 143 = SIGTERM
} else if (exitCode === 124) {
// Exit code 124 = timeout, 137 = SIGKILL (likely OOM), 143 = SIGTERM
if (exitCode === 124) {
core.warning(
`⚠ ${dbFile}: Integrity check timed out after ${durationSeconds}s - skipping`,
`⚠ ${dbFile}: Integrity check timed out after ${durationSeconds}s - skipping (not counted as failure)`,
);
} else if (
exitCode === 137 ||
errorMessage.toLowerCase().includes("out of memory") ||
errorMessage.toLowerCase().includes("cannot allocate memory")
) {
core.warning(
`⚠ ${dbFile}: Integrity check hit memory limit - skipping`,
`⚠ ${dbFile}: Integrity check hit memory limit - skipping (not counted as failure)`,
);
} else {
core.warning(
Expand All @@ -286,12 +147,6 @@ async function checkBoltDbIntegrity(skip = false): Promise<boolean> {
return true;
}
} catch (error) {
if (error instanceof ExecTimeoutError) {
core.warning(
`Integrity check hit hard timeout during filesystem access (possible I/O stall) - skipping`,
);
return true;
}
core.info(
`/var/lib/buildkit directory not found, skipping database checks ${(error as Error).message}`,
);
Expand All @@ -312,23 +167,9 @@ export interface Inputs {
"github-token": string;
"skip-integrity-check": boolean;
"driver-opts": string[];
"max-parallelism": number | null;
}

async function getInputs(): Promise<Inputs> {
const maxParallelismInput = core.getInput("max-parallelism");
let maxParallelism: number | null = null;
if (maxParallelismInput) {
const parsed = parseInt(maxParallelismInput, 10);
if (!isNaN(parsed) && parsed > 0) {
maxParallelism = parsed;
} else {
core.warning(
`Invalid max-parallelism value '${maxParallelismInput}', ignoring. Must be a positive integer.`,
);
}
}

return {
"buildx-version": core.getInput("buildx-version"),
"buildkit-version": core.getInput("buildkit-version"),
Expand All @@ -340,7 +181,6 @@ async function getInputs(): Promise<Inputs> {
ignoreComma: true,
quote: false,
}),
"max-parallelism": maxParallelism,
};
}

Expand Down Expand Up @@ -455,14 +295,8 @@ async function startBlacksmithBuilder(
}
}

// Get CPU count for parallelism, allow user override via max-parallelism input
let parallelism = await getNumCPUs();
if (inputs["max-parallelism"] !== null) {
core.info(
`Overriding max-parallelism from ${parallelism} (nproc) to ${inputs["max-parallelism"]} (user-specified)`,
);
parallelism = inputs["max-parallelism"];
}
// Get CPU count for parallelism
const parallelism = await getNumCPUs();

// Check if buildkitd is already running before starting
try {
Expand Down Expand Up @@ -688,9 +522,43 @@ void actionsToolkit.run(

// Optional: Prune cache before shutdown (non-critical)
try {
// Capture cache state BEFORE prune for diagnostics
try {
core.info("=== BuildKit cache BEFORE prune ===");
const { stdout: duBeforeVerbose } = await execAsync(
`sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du --verbose 2>&1 | tail -200`,
);
core.info(duBeforeVerbose);
const { stdout: duBeforeSummary } = await execAsync(
`sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du 2>&1 | tail -5`,
);
core.info(`Cache summary before prune: ${duBeforeSummary}`);
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New buildctl du diagnostics can hang cleanup

Medium Severity

The new pre/post-prune buildctl du diagnostics run without timeout limits. If buildkitd is unresponsive, these calls can block indefinitely and prevent prune, shutdown, unmount, and sticky-disk commit from completing.

Additional Locations (1)
Fix in Cursor Fix in Web

} catch (e) {
core.warning(
`Could not get pre-prune du: ${(e as Error).message}`,
);
}

core.info("Pruning BuildKit cache");
await pruneBuildkitCache();
core.info("BuildKit cache pruned");

// Capture cache state AFTER prune for diagnostics
try {
core.info("=== BuildKit cache AFTER prune ===");
const { stdout: duAfterVerbose } = await execAsync(
`sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du --verbose 2>&1 | tail -200`,
);
core.info(duAfterVerbose);
const { stdout: duAfterSummary } = await execAsync(
`sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du 2>&1 | tail -5`,
);
core.info(`Cache summary after prune: ${duAfterSummary}`);
} catch (e) {
core.warning(
`Could not get post-prune du: ${(e as Error).message}`,
);
}
} catch (error) {
core.warning(
`Error pruning BuildKit cache: ${(error as Error).message}`,
Expand Down Expand Up @@ -771,19 +639,6 @@ void actionsToolkit.run(
// Step 2: Sync and unmount sticky disk
await execAsync("sync");

// Get device path before unmount for durability flush
let devicePath: string | null = null;
try {
devicePath = await getDeviceFromMount(mountPoint);
if (devicePath) {
core.info(
`Found device ${devicePath} for mount point ${mountPoint}`,
);
}
} catch {
core.info(`Could not determine device for ${mountPoint}`);
}

try {
const { stdout: mountOutput } = await execAsync(
`mount | grep "${mountPoint}"`,
Expand Down Expand Up @@ -846,16 +701,6 @@ void actionsToolkit.run(
await new Promise((resolve) => setTimeout(resolve, 100));
}
}

// Flush block device buffers after unmount to ensure data durability
// before the Ceph RBD snapshot is taken. The device is still mapped even though unmounted.
if (devicePath) {
await flushBlockDevice(devicePath);
} else {
core.info(
"Skipping durability flush: device path not found for mount point",
);
}
} else {
core.debug("No sticky disk mount found");
}
Expand Down
Loading
Loading