useblacksmith · ConnorMul · Mar 18, 2026 · Mar 18, 2026 · cursor · Mar 18, 2026
diff --git a/src/main.ts b/src/main.ts
@@ -12,18 +12,13 @@ import { exec } from "child_process";
 
 import * as stateHelper from "./state-helper";
 import * as reporter from "./reporter";
-import {
-  execWithTimeout,
-  ExecTimeoutError,
-  BOLT_CHECK_MEMORY_MAX_BYTES,
-  BOLT_CHECK_MAX_FILE_BYTES,
-} from "./exec-utils";
 import {
   setupStickyDisk,
   startAndConfigureBuildkitd,
   getNumCPUs,
   pruneBuildkitCache,
   logDatabaseHashes,
+  BUILDKIT_DAEMON_ADDR,
 } from "./setup_builder";
 import {
   installBuildKit,
@@ -38,112 +33,6 @@ const DEFAULT_BUILDX_VERSION = "v0.23.0";
 const mountPoint = "/var/lib/buildkit";
 const execAsync = promisify(exec);
 
-async function getDeviceFromMount(mountPath: string): Promise<string | null> {
-  try {
-    const { stdout } = await execAsync(`findmnt -n -o SOURCE "${mountPath}"`);
-    const device = stdout.trim();
-    if (device) {
-      // Log full mount info for debugging
-      try {
-        const { stdout: mountInfo } = await execAsync(
-          `findmnt -n -o SOURCE,FSTYPE,OPTIONS "${mountPath}"`,
-        );
-        core.info(`Mount info for ${mountPath}: ${mountInfo.trim()}`);
-      } catch {
-        // Ignore if we can't get full mount info
-      }
-      return device;
-    }
-  } catch {
-    core.info(`findmnt failed for ${mountPath}, trying mount command`);
-  }
-
-  try {
-    const { stdout } = await execAsync(`mount | grep " ${mountPath} "`);
-    const match = stdout.match(/^(\/dev\/\S+)/);
-    if (match) {
-      core.info(`Mount info for ${mountPath}: ${stdout.trim()}`);
-      return match[1];
-    }
-  } catch {
-    core.info(`mount grep failed for ${mountPath}`);
-  }
-
-  return null;
-}
-
-const FLUSH_TIMEOUT_SECS = 10;
-const TIMEOUT_EXIT_CODE = 124;
-
-async function flushBlockDevice(devicePath: string): Promise<void> {
-  const deviceName = devicePath.replace("/dev/", "");
-  if (!deviceName) {
-    core.warning(`Could not extract device name from ${devicePath}`);
-    return;
-  }
-
-  const statPath = `/sys/block/${deviceName}/stat`;
-
-  let beforeStats = "";
-  try {
-    const { stdout } = await execAsync(`cat ${statPath}`);
-    beforeStats = stdout.trim();
-  } catch {
-    core.warning(`Could not read block device stats before flush: ${statPath}`);
-  }
-
-  const startTime = Date.now();
-  try {
-    const { stdout, stderr } = await execAsync(
-      `timeout ${FLUSH_TIMEOUT_SECS} sudo blockdev --flushbufs ${devicePath}; echo "EXIT_CODE:$?"`,
-    );
-    const duration = Date.now() - startTime;
-
-    // Parse exit code from output
-    const exitCodeMatch = stdout.match(/EXIT_CODE:(\d+)/);
-    const exitCode = exitCodeMatch ? parseInt(exitCodeMatch[1], 10) : 0;
-
-    if (exitCode === TIMEOUT_EXIT_CODE) {
-      core.warning(
-        `guest flush timed out for ${devicePath} after ${FLUSH_TIMEOUT_SECS}s`,
-      );
-      return;
-    }
-
-    if (exitCode !== 0) {
-      core.warning(
-        `guest flush failed for ${devicePath} after ${duration}ms: exit code ${exitCode}, stderr: ${stderr}`,
-      );
-      return;
-    }
-
-    // Log stderr as warning even on success, in case there's useful diagnostic info
-    if (stderr && stderr.trim()) {
-      core.warning(`guest flush stderr (exit 0): ${stderr.trim()}`);
-    }
-
-    let afterStats = "";
-    try {
-      const { stdout } = await execAsync(`cat ${statPath}`);
-      afterStats = stdout.trim();
-    } catch {
-      core.warning(
-        `Could not read block device stats after flush: ${statPath}`,
-      );
-    }
-
-    core.info(
-      `guest flush duration: ${duration}ms, device: ${devicePath}, before_stats: ${beforeStats}, after_stats: ${afterStats}`,
-    );
-  } catch (error) {
-    const duration = Date.now() - startTime;
-    const errorMsg = error instanceof Error ? error.message : String(error);
-    core.warning(
-      `guest flush failed for ${devicePath} after ${duration}ms: ${errorMsg}`,
-    );
-  }
-}
-
 async function checkBoltDbIntegrity(skip = false): Promise<boolean> {
   if (skip) {
     core.info(
@@ -155,20 +44,14 @@ async function checkBoltDbIntegrity(skip = false): Promise<boolean> {
   try {
     // Check if /var/lib/buildkit directory exists
     try {
-      await execWithTimeout(
-        "test -d /var/lib/buildkit",
-        15_000,
-        "test buildkit dir exists",
-      );
+      await execAsync("test -d /var/lib/buildkit");
       core.debug(
         "Found /var/lib/buildkit directory, checking for database files",
       );
 
       // Find all *.db files in /var/lib/buildkit
-      const { stdout: dbFiles } = await execWithTimeout(
+      const { stdout: dbFiles } = await execAsync(
         "find /var/lib/buildkit -name '*.db' 2>/dev/null || true",
-        30_000,
-        "find db files",
       );
 
       if (dbFiles.trim()) {
@@ -183,14 +66,11 @@ async function checkBoltDbIntegrity(skip = false): Promise<boolean> {
             try {
               // Get file size
               let sizeInfo = "";
-              let sizeBytes = 0;
               try {
-                const { stdout: sizeOutput } = await execWithTimeout(
+                const { stdout: sizeOutput } = await execAsync(
                   `stat -c%s "${dbFile}" 2>/dev/null || stat -f%z "${dbFile}"`,
-                  15_000,
-                  `stat db file ${dbFile}`,
                 );
-                sizeBytes = parseInt(sizeOutput.trim(), 10);
+                const sizeBytes = parseInt(sizeOutput.trim(), 10);
                 if (!isNaN(sizeBytes) && sizeBytes > 0) {
                   const sizeMB = (sizeBytes / (1024 * 1024)).toFixed(2);
                   sizeInfo = ` (${sizeMB} MB)`;
@@ -201,26 +81,12 @@ async function checkBoltDbIntegrity(skip = false): Promise<boolean> {
                 );
               }
 
-              // Skip integrity check for files that are too large for the memory-limited
-              // systemd scope. bbolt check mmaps the entire file, and with ~50-60 MB of
-              // Go runtime overhead the process will be OOM-killed for large files.
-              if (sizeBytes > BOLT_CHECK_MAX_FILE_BYTES) {
-                const sizeMB = (sizeBytes / (1024 * 1024)).toFixed(2);
-                core.info(
-                  `${dbFile}: Skipping integrity check - file size ${sizeMB} MB exceeds limit (${BOLT_CHECK_MAX_FILE_BYTES / (1024 * 1024)} MB)`,
-                );
-                continue;
-              }
-
               core.info(`Running bolt check on ${dbFile}${sizeInfo}...`);
               const startTime = Date.now();
 
               try {
-                const memoryMaxMB = BOLT_CHECK_MEMORY_MAX_BYTES / (1024 * 1024);
-                const { stdout: checkResult } = await execWithTimeout(
-                  `sudo systemd-run --scope --quiet -p MemoryMax=${memoryMaxMB}M -p RuntimeMaxSec=6s bbolt check "${dbFile}" 2>&1`,
-                  30_000,
-                  `bbolt check ${dbFile}`,
+                const { stdout: checkResult } = await execAsync(
+                  `sudo systemd-run --scope --quiet -p MemoryMax=512M -p RuntimeMaxSec=6s bbolt check "${dbFile}" 2>&1`,
                 );
                 const duration = Date.now() - startTime;
                 const durationSeconds = (duration / 1000).toFixed(2);
@@ -245,23 +111,18 @@ async function checkBoltDbIntegrity(skip = false): Promise<boolean> {
                 const exitCode = (checkError as { code?: number }).code;
                 const errorMessage = (checkError as Error).message;
 
-                // ExecTimeoutError = Promise.race timeout (process stuck in D state, e.g. Ceph partition)
-                if (checkError instanceof ExecTimeoutError) {
-                  core.warning(
-                    `⚠ ${dbFile}: Integrity check hit hard timeout after ${durationSeconds}s (possible I/O stall) - skipping`,
-                  );
-                  // Exit code 124 = timeout, 137 = SIGKILL (likely OOM), 143 = SIGTERM
-                } else if (exitCode === 124) {
+                // Exit code 124 = timeout, 137 = SIGKILL (likely OOM), 143 = SIGTERM
+                if (exitCode === 124) {
                   core.warning(
-                    `⚠ ${dbFile}: Integrity check timed out after ${durationSeconds}s - skipping`,
+                    `⚠ ${dbFile}: Integrity check timed out after ${durationSeconds}s - skipping (not counted as failure)`,
                   );
                 } else if (
                   exitCode === 137 ||
                   errorMessage.toLowerCase().includes("out of memory") ||
                   errorMessage.toLowerCase().includes("cannot allocate memory")
                 ) {
                   core.warning(
-                    `⚠ ${dbFile}: Integrity check hit memory limit - skipping`,
+                    `⚠ ${dbFile}: Integrity check hit memory limit - skipping (not counted as failure)`,
                   );
                 } else {
                   core.warning(
@@ -286,12 +147,6 @@ async function checkBoltDbIntegrity(skip = false): Promise<boolean> {
         return true;
       }
     } catch (error) {
-      if (error instanceof ExecTimeoutError) {
-        core.warning(
-          `Integrity check hit hard timeout during filesystem access (possible I/O stall) - skipping`,
-        );
-        return true;
-      }
       core.info(
         `/var/lib/buildkit directory not found, skipping database checks ${(error as Error).message}`,
       );
@@ -312,23 +167,9 @@ export interface Inputs {
   "github-token": string;
   "skip-integrity-check": boolean;
   "driver-opts": string[];
-  "max-parallelism": number | null;
 }
 
 async function getInputs(): Promise<Inputs> {
-  const maxParallelismInput = core.getInput("max-parallelism");
-  let maxParallelism: number | null = null;
-  if (maxParallelismInput) {
-    const parsed = parseInt(maxParallelismInput, 10);
-    if (!isNaN(parsed) && parsed > 0) {
-      maxParallelism = parsed;
-    } else {
-      core.warning(
-        `Invalid max-parallelism value '${maxParallelismInput}', ignoring. Must be a positive integer.`,
-      );
-    }
-  }
-
   return {
     "buildx-version": core.getInput("buildx-version"),
     "buildkit-version": core.getInput("buildkit-version"),
@@ -340,7 +181,6 @@ async function getInputs(): Promise<Inputs> {
       ignoreComma: true,
       quote: false,
     }),
-    "max-parallelism": maxParallelism,
   };
 }
 
@@ -455,14 +295,8 @@ async function startBlacksmithBuilder(
       }
     }
 
-    // Get CPU count for parallelism, allow user override via max-parallelism input
-    let parallelism = await getNumCPUs();
-    if (inputs["max-parallelism"] !== null) {
-      core.info(
-        `Overriding max-parallelism from ${parallelism} (nproc) to ${inputs["max-parallelism"]} (user-specified)`,
-      );
-      parallelism = inputs["max-parallelism"];
-    }
+    // Get CPU count for parallelism
+    const parallelism = await getNumCPUs();
 
     // Check if buildkitd is already running before starting
     try {
@@ -688,9 +522,43 @@ void actionsToolkit.run(
 
             // Optional: Prune cache before shutdown (non-critical)
             try {
+              // Capture cache state BEFORE prune for diagnostics
+              try {
+                core.info("=== BuildKit cache BEFORE prune ===");
+                const { stdout: duBeforeVerbose } = await execAsync(
+                  `sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du --verbose 2>&1 | tail -200`,
+                );
+                core.info(duBeforeVerbose);
+                const { stdout: duBeforeSummary } = await execAsync(
+                  `sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du 2>&1 | tail -5`,
+                );
+                core.info(`Cache summary before prune: ${duBeforeSummary}`);
+              } catch (e) {
+                core.warning(
+                  `Could not get pre-prune du: ${(e as Error).message}`,
+                );
+              }
+
               core.info("Pruning BuildKit cache");
               await pruneBuildkitCache();
               core.info("BuildKit cache pruned");
+
+              // Capture cache state AFTER prune for diagnostics
+              try {
+                core.info("=== BuildKit cache AFTER prune ===");
+                const { stdout: duAfterVerbose } = await execAsync(
+                  `sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du --verbose 2>&1 | tail -200`,
+                );
+                core.info(duAfterVerbose);
+                const { stdout: duAfterSummary } = await execAsync(
+                  `sudo buildctl --addr ${BUILDKIT_DAEMON_ADDR} du 2>&1 | tail -5`,
+                );
+                core.info(`Cache summary after prune: ${duAfterSummary}`);
+              } catch (e) {
+                core.warning(
+                  `Could not get post-prune du: ${(e as Error).message}`,
+                );
+              }
             } catch (error) {
               core.warning(
                 `Error pruning BuildKit cache: ${(error as Error).message}`,
@@ -771,19 +639,6 @@ void actionsToolkit.run(
         // Step 2: Sync and unmount sticky disk
         await execAsync("sync");
 
-        // Get device path before unmount for durability flush
-        let devicePath: string | null = null;
-        try {
-          devicePath = await getDeviceFromMount(mountPoint);
-          if (devicePath) {
-            core.info(
-              `Found device ${devicePath} for mount point ${mountPoint}`,
-            );
-          }
-        } catch {
-          core.info(`Could not determine device for ${mountPoint}`);
-        }
-
         try {
           const { stdout: mountOutput } = await execAsync(
             `mount | grep "${mountPoint}"`,
@@ -846,16 +701,6 @@ void actionsToolkit.run(
                 await new Promise((resolve) => setTimeout(resolve, 100));
               }
             }
-
-            // Flush block device buffers after unmount to ensure data durability
-            // before the Ceph RBD snapshot is taken. The device is still mapped even though unmounted.
-            if (devicePath) {
-              await flushBlockDevice(devicePath);
-            } else {
-              core.info(
-                "Skipping durability flush: device path not found for mount point",
-              );
-            }
           } else {
             core.debug("No sticky disk mount found");
           }