diff --git a/packages/dashmate/scripts/helper.js b/packages/dashmate/scripts/helper.js index 12a510e0a05..94eb1e95cd8 100644 --- a/packages/dashmate/scripts/helper.js +++ b/packages/dashmate/scripts/helper.js @@ -1,7 +1,14 @@ import dotenv from 'dotenv'; import { asValue } from 'awilix'; +import graceful from 'node-graceful'; import createDIContainer from '../src/createDIContainer.js'; +// Container names that may be left orphaned from failed SSL renewal attempts +const EPHEMERAL_SSL_CONTAINERS = [ + 'dashmate-zerossl-validation', + 'dashmate-letsencrypt-lego', +]; + (async function main() { // Read environment variables from .env file dotenv.config(); @@ -19,6 +26,43 @@ import createDIContainer from '../src/createDIContainer.js'; const container = await createDIContainer(process.env); + // Set up graceful shutdown to clean up any containers started during + // SSL certificate renewal (e.g. the ZeroSSL verification server on port 80) + const stopAllContainers = container.resolve('stopAllContainers'); + const startedContainers = container.resolve('startedContainers'); + + graceful.exitOnDouble = false; + graceful.on('exit', async () => { + // eslint-disable-next-line no-console + console.log('Shutting down dashmate helper, cleaning up containers...'); + + await stopAllContainers( + startedContainers.getContainers(), + { remove: true }, + ); + }); + + // Clean up any orphaned ephemeral SSL containers left from previous + // failed renewal attempts (e.g. if the helper crashed or was killed + // while a verification server was running on port 80) + const docker = container.resolve('docker'); + + await Promise.all(EPHEMERAL_SSL_CONTAINERS.map(async (name) => { + try { + const orphanedContainer = docker.getContainer(name); + await orphanedContainer.remove({ force: true }); + + // eslint-disable-next-line no-console + console.log(`Removed orphaned container: ${name}`); + } catch (e) { + // 404 means container doesn't exist — that's the normal case + if (e.statusCode !== 404) { + // eslint-disable-next-line no-console + console.error(`Failed to remove orphaned container ${name}: ${e.message}`); + } + } + })); + // Load configs /** * @type {ConfigFileJsonRepository} diff --git a/packages/dashmate/src/helper/scheduleRenewZeroSslCertificateFactory.js b/packages/dashmate/src/helper/scheduleRenewZeroSslCertificateFactory.js index 8d741511dd8..3f757b26aff 100644 --- a/packages/dashmate/src/helper/scheduleRenewZeroSslCertificateFactory.js +++ b/packages/dashmate/src/helper/scheduleRenewZeroSslCertificateFactory.js @@ -50,28 +50,53 @@ export default function scheduleRenewZeroSslCertificateFactory( console.log(`SSL certificate ${certificate.id} will expire at ${certificate.expires}. Schedule to obtain at ${expiresAt}.`); } + let renewalSucceeded = false; + const job = new CronJob(expiresAt, async () => { - const tasks = obtainZeroSSLCertificateTask(config); + try { + const tasks = obtainZeroSSLCertificateTask(config); + + await tasks.run({ + expirationDays: Certificate.EXPIRATION_LIMIT_DAYS, + noRetry: true, + }); + + // Write config files + configFileRepository.write(configFile); + writeConfigTemplates(config); - await tasks.run({ - expirationDays: Certificate.EXPIRATION_LIMIT_DAYS, - noRetry: true, - }); + // TODO: We can use https://www.envoyproxy.io/docs/envoy/v1.30.1/start/quick-start/configuration-dynamic-filesystem.html#start-quick-start-dynamic-fs-dynamic-lds + // to dynamically update envoy configuration without restarting it - // Write config files - configFileRepository.write(configFile); - writeConfigTemplates(config); + // Restart Gateway to catch up new SSL certificates + await dockerCompose.execCommand(config, 'gateway', 'kill -SIGHUP 1'); - // TODO: We can use https://www.envoyproxy.io/docs/envoy/v1.30.1/start/quick-start/configuration-dynamic-filesystem.html#start-quick-start-dynamic-fs-dynamic-lds - // to dynamically update envoy configuration without restarting it + // eslint-disable-next-line no-console + console.log('ZeroSSL certificate renewed successfully'); - // Restart Gateway to catch up new SSL certificates - await dockerCompose.execCommand(config, 'gateway', 'kill -SIGHUP 1'); + renewalSucceeded = true; + } catch (e) { + // eslint-disable-next-line no-console + console.error(`Failed to renew ZeroSSL certificate: ${e.message}`); - return job.stop(); + renewalSucceeded = false; + } + + job.stop(); }, async () => { - // Schedule new cron task - process.nextTick(() => scheduleRenewZeroSslCertificate(config)); + // Schedule new cron task after completion + if (renewalSucceeded) { + // Success: reschedule immediately to read new cert expiry + process.nextTick(() => scheduleRenewZeroSslCertificate(config)); + } else { + // Failure: wait 1 hour before retrying to avoid tight failure loops + // eslint-disable-next-line no-console + console.log('Scheduling ZeroSSL renewal retry in 1 hour'); + + setTimeout(() => { + scheduleRenewZeroSslCertificate(config); + }, 60 * 60 * 1000); + } }); job.start(); diff --git a/packages/dashmate/src/listr/tasks/ssl/zerossl/obtainZeroSSLCertificateTaskFactory.js b/packages/dashmate/src/listr/tasks/ssl/zerossl/obtainZeroSSLCertificateTaskFactory.js index 4636b37069b..4786261a9ae 100644 --- a/packages/dashmate/src/listr/tasks/ssl/zerossl/obtainZeroSSLCertificateTaskFactory.js +++ b/packages/dashmate/src/listr/tasks/ssl/zerossl/obtainZeroSSLCertificateTaskFactory.js @@ -43,7 +43,7 @@ export default function obtainZeroSSLCertificateTaskFactory( * @return {Listr} */ function obtainZeroSSLCertificateTask(config) { - return new Listr([ + const tasks = new Listr([ { title: 'Check if certificate already exists and not expiring soon', // Skips the check if force flag is set @@ -311,6 +311,32 @@ and all Dash service ports listed above.`); showErrorMessage: true, }, }); + + // Wrap run() to ensure the verification server is always cleaned up on failure. + // If a task after "Start verification server" throws (e.g. domain verification + // or certificate download fails), Listr aborts and the "Stop verification server" + // task at the end never executes — leaving an orphaned container bound to port 80. + // This wrapper guarantees cleanup regardless of where the pipeline fails. + const originalRun = tasks.run.bind(tasks); + tasks.run = async (context) => { + try { + return await originalRun(context); + } catch (error) { + try { + await verificationServer.stop(); + } catch { + // Ignore cleanup errors — server may not have been started + } + try { + await verificationServer.destroy(); + } catch { + // Ignore cleanup errors — server may not have been set up + } + throw error; + } + }; + + return tasks; } return obtainZeroSSLCertificateTask;