From c614762b29e06f2a0dd86d18d8014e60a9fc11d2 Mon Sep 17 00:00:00 2001 From: ktechmidas <920871+ktechmidas@users.noreply.github.com> Date: Thu, 26 Feb 2026 14:54:43 +0300 Subject: [PATCH] fix(dashmate): prevent orphaned verification container blocking SSL renewal When ZeroSSL certificate renewal fails mid-pipeline (e.g. during domain verification or certificate download), the verification server container bound to port 80 is never cleaned up. This blocks all subsequent renewal attempts, causing certificates to expire across many nodes simultaneously if ZeroSSL experiences an API issue during the renewal window. Three fixes: 1. Wrap the obtain task's run() to ensure the verification server container is always stopped on failure, not just on success. 2. Add try/catch with 1-hour retry backoff to the ZeroSSL scheduler, matching the existing Let's Encrypt scheduler pattern. Previously, unhandled errors could crash silently or cause tight failure loops. 3. Add graceful shutdown and startup orphan cleanup to the helper process. On boot, any leftover verification containers from previous failed runs are force-removed before scheduling renewals. Co-Authored-By: Claude Opus 4.6 --- packages/dashmate/scripts/helper.js | 44 +++++++++++++++ .../scheduleRenewZeroSslCertificateFactory.js | 55 ++++++++++++++----- .../obtainZeroSSLCertificateTaskFactory.js | 28 +++++++++- 3 files changed, 111 insertions(+), 16 deletions(-) diff --git a/packages/dashmate/scripts/helper.js b/packages/dashmate/scripts/helper.js index 12a510e0a05..94eb1e95cd8 100644 --- a/packages/dashmate/scripts/helper.js +++ b/packages/dashmate/scripts/helper.js @@ -1,7 +1,14 @@ import dotenv from 'dotenv'; import { asValue } from 'awilix'; +import graceful from 'node-graceful'; import createDIContainer from '../src/createDIContainer.js'; +// Container names that may be left orphaned from failed SSL renewal attempts +const EPHEMERAL_SSL_CONTAINERS = [ + 'dashmate-zerossl-validation', + 'dashmate-letsencrypt-lego', +]; + (async function main() { // Read environment variables from .env file dotenv.config(); @@ -19,6 +26,43 @@ import createDIContainer from '../src/createDIContainer.js'; const container = await createDIContainer(process.env); + // Set up graceful shutdown to clean up any containers started during + // SSL certificate renewal (e.g. the ZeroSSL verification server on port 80) + const stopAllContainers = container.resolve('stopAllContainers'); + const startedContainers = container.resolve('startedContainers'); + + graceful.exitOnDouble = false; + graceful.on('exit', async () => { + // eslint-disable-next-line no-console + console.log('Shutting down dashmate helper, cleaning up containers...'); + + await stopAllContainers( + startedContainers.getContainers(), + { remove: true }, + ); + }); + + // Clean up any orphaned ephemeral SSL containers left from previous + // failed renewal attempts (e.g. if the helper crashed or was killed + // while a verification server was running on port 80) + const docker = container.resolve('docker'); + + await Promise.all(EPHEMERAL_SSL_CONTAINERS.map(async (name) => { + try { + const orphanedContainer = docker.getContainer(name); + await orphanedContainer.remove({ force: true }); + + // eslint-disable-next-line no-console + console.log(`Removed orphaned container: ${name}`); + } catch (e) { + // 404 means container doesn't exist — that's the normal case + if (e.statusCode !== 404) { + // eslint-disable-next-line no-console + console.error(`Failed to remove orphaned container ${name}: ${e.message}`); + } + } + })); + // Load configs /** * @type {ConfigFileJsonRepository} diff --git a/packages/dashmate/src/helper/scheduleRenewZeroSslCertificateFactory.js b/packages/dashmate/src/helper/scheduleRenewZeroSslCertificateFactory.js index 8d741511dd8..3f757b26aff 100644 --- a/packages/dashmate/src/helper/scheduleRenewZeroSslCertificateFactory.js +++ b/packages/dashmate/src/helper/scheduleRenewZeroSslCertificateFactory.js @@ -50,28 +50,53 @@ export default function scheduleRenewZeroSslCertificateFactory( console.log(`SSL certificate ${certificate.id} will expire at ${certificate.expires}. Schedule to obtain at ${expiresAt}.`); } + let renewalSucceeded = false; + const job = new CronJob(expiresAt, async () => { - const tasks = obtainZeroSSLCertificateTask(config); + try { + const tasks = obtainZeroSSLCertificateTask(config); + + await tasks.run({ + expirationDays: Certificate.EXPIRATION_LIMIT_DAYS, + noRetry: true, + }); + + // Write config files + configFileRepository.write(configFile); + writeConfigTemplates(config); - await tasks.run({ - expirationDays: Certificate.EXPIRATION_LIMIT_DAYS, - noRetry: true, - }); + // TODO: We can use https://www.envoyproxy.io/docs/envoy/v1.30.1/start/quick-start/configuration-dynamic-filesystem.html#start-quick-start-dynamic-fs-dynamic-lds + // to dynamically update envoy configuration without restarting it - // Write config files - configFileRepository.write(configFile); - writeConfigTemplates(config); + // Restart Gateway to catch up new SSL certificates + await dockerCompose.execCommand(config, 'gateway', 'kill -SIGHUP 1'); - // TODO: We can use https://www.envoyproxy.io/docs/envoy/v1.30.1/start/quick-start/configuration-dynamic-filesystem.html#start-quick-start-dynamic-fs-dynamic-lds - // to dynamically update envoy configuration without restarting it + // eslint-disable-next-line no-console + console.log('ZeroSSL certificate renewed successfully'); - // Restart Gateway to catch up new SSL certificates - await dockerCompose.execCommand(config, 'gateway', 'kill -SIGHUP 1'); + renewalSucceeded = true; + } catch (e) { + // eslint-disable-next-line no-console + console.error(`Failed to renew ZeroSSL certificate: ${e.message}`); - return job.stop(); + renewalSucceeded = false; + } + + job.stop(); }, async () => { - // Schedule new cron task - process.nextTick(() => scheduleRenewZeroSslCertificate(config)); + // Schedule new cron task after completion + if (renewalSucceeded) { + // Success: reschedule immediately to read new cert expiry + process.nextTick(() => scheduleRenewZeroSslCertificate(config)); + } else { + // Failure: wait 1 hour before retrying to avoid tight failure loops + // eslint-disable-next-line no-console + console.log('Scheduling ZeroSSL renewal retry in 1 hour'); + + setTimeout(() => { + scheduleRenewZeroSslCertificate(config); + }, 60 * 60 * 1000); + } }); job.start(); diff --git a/packages/dashmate/src/listr/tasks/ssl/zerossl/obtainZeroSSLCertificateTaskFactory.js b/packages/dashmate/src/listr/tasks/ssl/zerossl/obtainZeroSSLCertificateTaskFactory.js index 4636b37069b..4786261a9ae 100644 --- a/packages/dashmate/src/listr/tasks/ssl/zerossl/obtainZeroSSLCertificateTaskFactory.js +++ b/packages/dashmate/src/listr/tasks/ssl/zerossl/obtainZeroSSLCertificateTaskFactory.js @@ -43,7 +43,7 @@ export default function obtainZeroSSLCertificateTaskFactory( * @return {Listr} */ function obtainZeroSSLCertificateTask(config) { - return new Listr([ + const tasks = new Listr([ { title: 'Check if certificate already exists and not expiring soon', // Skips the check if force flag is set @@ -311,6 +311,32 @@ and all Dash service ports listed above.`); showErrorMessage: true, }, }); + + // Wrap run() to ensure the verification server is always cleaned up on failure. + // If a task after "Start verification server" throws (e.g. domain verification + // or certificate download fails), Listr aborts and the "Stop verification server" + // task at the end never executes — leaving an orphaned container bound to port 80. + // This wrapper guarantees cleanup regardless of where the pipeline fails. + const originalRun = tasks.run.bind(tasks); + tasks.run = async (context) => { + try { + return await originalRun(context); + } catch (error) { + try { + await verificationServer.stop(); + } catch { + // Ignore cleanup errors — server may not have been started + } + try { + await verificationServer.destroy(); + } catch { + // Ignore cleanup errors — server may not have been set up + } + throw error; + } + }; + + return tasks; } return obtainZeroSSLCertificateTask;