diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs
index 9c365bfe..c298fa89 100644
--- a/crates/openshell-bootstrap/src/docker.rs
+++ b/crates/openshell-bootstrap/src/docker.rs
@@ -40,6 +40,76 @@ fn env_bool(key: &str) -> Option {
     })
 }
 
+/// Parse a human-readable memory size string into bytes.
+///
+/// Accepts integers (bytes) or values with `k`/`m`/`g`/`t` suffixes
+/// (case-insensitive, with or without one trailing `b`). The `ki`/`mi`/
+/// `gi`/`ti` spellings are equivalent: every unit is a power of 1024.
+/// Examples: `80g`, `4096m`, `0.5g`, `1073741824`.
+///
+/// Returns an error if the value is empty, uses an unknown suffix, overflows
+/// `i64`, or is below the 6 MiB minimum required by Docker.
+pub fn parse_memory_limit(s: &str) -> Result<i64> {
+    let s = s.trim().to_ascii_lowercase();
+    if s.is_empty() {
+        miette::bail!("empty memory limit string");
+    }
+
+    // Split into numeric part and optional suffix (a single trailing `b` is dropped).
+    let (num_str, suffix) = match s.find(|c: char| !c.is_ascii_digit() && c != '.') {
+        Some(idx) => (&s[..idx], s[idx..].strip_suffix('b').unwrap_or(&s[idx..])),
+        None => (s.as_str(), ""),
+    };
+
+    let value: f64 = num_str
+        .parse()
+        .into_diagnostic()
+        .wrap_err_with(|| format!("invalid numeric part in memory limit: {num_str}"))?;
+
+    let multiplier: f64 = match suffix {
+        "" => 1.0,
+        "k" | "ki" => 1024.0,
+        "m" | "mi" => 1024.0 * 1024.0,
+        "g" | "gi" => 1024.0 * 1024.0 * 1024.0,
+        "t" | "ti" => 1024.0 * 1024.0 * 1024.0 * 1024.0,
+        other => miette::bail!("unknown memory suffix: {other}"),
+    };
+    // `i64::MAX as f64` rounds up to 2^63, so compare with `>=` to reject it too.
+    let raw = value * multiplier;
+    if raw >= i64::MAX as f64 {
+        miette::bail!("memory limit too large (exceeds i64::MAX): {s}");
+    }
+    #[allow(clippy::cast_possible_truncation)]
+    let bytes = raw as i64;
+
+    // Docker rejects limits below 6 MiB; enforce the same floor so users get a
+    // clear error instead of an opaque Docker API rejection.
+    const MIN_MEMORY_BYTES: i64 = 6 * 1024 * 1024;
+    if bytes < MIN_MEMORY_BYTES {
+        miette::bail!("memory limit must be at least 6 MiB, got: {s} ({bytes} bytes)");
+    }
+    Ok(bytes)
+}
+
+/// Detect a safe memory limit for the gateway container.
+///
+/// Queries the Docker daemon for `MemTotal` (via `docker info`) and returns
+/// 80% of that value. On macOS and Windows the daemon runs inside a Linux VM
+/// (Docker Desktop, colima, WSL2), so the reported total reflects the VM's
+/// allocated memory rather than the full host RAM.
+///
+/// Returns `None` if the daemon does not report memory information.
+pub async fn detect_memory_limit(docker: &Docker) -> Option<i64> {
+    let info = docker.info().await.ok()?;
+    let total_bytes = info.mem_total?;
+    if total_bytes <= 0 {
+        return None;
+    }
+    #[allow(clippy::cast_possible_truncation)]
+    let limit = (total_bytes as f64 * 0.8) as i64;
+    Some(limit)
+}
+
 /// Platform information for a Docker daemon host.
 #[derive(Debug, Clone)]
 pub struct HostPlatform {
@@ -236,6 +306,62 @@ fn home_dir() -> Option {
     std::env::var("HOME").ok()
 }
 
+/// Parse resolv.conf content, extracting nameserver IPs and filtering loopback addresses.
+fn parse_resolv_conf(contents: &str) -> Vec<String> {
+    contents
+        .lines()
+        .filter_map(|line| {
+            // Match the keyword as a whole token so that lines such as
+            // `nameservers ...` are not mistaken for resolver entries.
+            let mut fields = line.split_whitespace();
+            if fields.next()? != "nameserver" {
+                return None;
+            }
+            let ip = fields.next()?;
+            let is_loopback = ip.starts_with("127.") || ip == "::1";
+            (!is_loopback).then(|| ip.to_string())
+        })
+        .collect()
+}
+
+/// Discover upstream DNS resolvers from systemd-resolved's configuration.
+///
+/// Only reads `/run/systemd/resolve/resolv.conf` — the upstream resolver file
+/// maintained by systemd-resolved. This file is only present on Linux hosts
+/// running systemd-resolved (e.g., Ubuntu), so the function is a no-op on
+/// macOS, Windows/WSL, and non-systemd Linux distributions.
+///
+/// We intentionally do NOT fall back to `/etc/resolv.conf` here. On Docker
+/// Desktop (macOS/Windows), `/etc/resolv.conf` may contain non-loopback
+/// resolvers that appear valid but are unreachable via direct UDP from inside
+/// the container's network stack. Those environments rely on the entrypoint's
+/// iptables DNAT proxy to Docker's embedded DNS — sniffing host resolvers
+/// would bypass that proxy and break DNS.
+///
+/// Returns an empty vec if no usable resolvers are found.
+fn resolve_upstream_dns() -> Vec<String> {
+    let paths = ["/run/systemd/resolve/resolv.conf"];
+
+    for path in &paths {
+        if let Ok(contents) = std::fs::read_to_string(path) {
+            let resolvers = parse_resolv_conf(&contents);
+
+            if !resolvers.is_empty() {
+                tracing::debug!(
+                    "Discovered {} upstream DNS resolver(s) from {}: {}",
+                    resolvers.len(),
+                    path,
+                    resolvers.join(", "),
+                );
+                return resolvers;
+            }
+        }
+    }
+
+    tracing::debug!("No upstream DNS resolvers found in host resolver config");
+    Vec::new()
+}
+
 /// Create an SSH Docker client from remote options.
 pub async fn create_ssh_docker_client(remote: &RemoteOptions) -> Result {
     // Ensure destination has ssh:// prefix
@@ -455,6 +581,8 @@ pub async fn ensure_container(
     registry_username: Option<&str>,
     registry_token: Option<&str>,
     gpu: bool,
+    is_remote: bool,
+    memory_limit: Option<i64>,
 ) -> Result<()> {
     let container_name = container_name(name);
 
@@ -559,6 +687,15 @@ pub async fn ensure_container(
         }]);
     }
 
+    // Apply memory limit. When set, Docker OOM-kills the container instead of
+    // letting unchecked sandbox growth trigger the host kernel OOM killer.
+    // Setting memory_swap equal to memory disables swap inside the container.
+    if let Some(mem) = memory_limit {
+        host_config.memory = Some(mem);
+        host_config.memory_swap = Some(mem);
+        tracing::info!("Container memory limit: {} MiB", mem / (1024 * 1024));
+    }
+
     let mut cmd = vec![
         "server".to_string(),
         "--disable=traefik".to_string(),
@@ -675,6 +812,17 @@ pub async fn ensure_container(
         env_vars.push("GPU_ENABLED=true".to_string());
     }
 
+    // Pass upstream DNS resolvers discovered on the host so the entrypoint
+    // can configure k3s without probing files inside the container.
+    // Skip for remote deploys — the local host's resolvers are likely wrong
+    // for the remote Docker host (different network, split-horizon DNS, etc.).
+    if !is_remote {
+        let upstream_dns = resolve_upstream_dns();
+        if !upstream_dns.is_empty() {
+            env_vars.push(format!("UPSTREAM_DNS={}", upstream_dns.join(",")));
+        }
+    }
+
     let env = Some(env_vars);
 
     let config = ContainerCreateBody {
@@ -1195,4 +1343,166 @@ mod tests {
             "should return a reasonable number of sockets"
         );
     }
+
+    #[test]
+    fn resolve_upstream_dns_filters_loopback() {
+        // This test validates the function runs without panic on the current host.
+        // The exact output depends on the host's DNS config, but loopback
+        // addresses must never appear in the result.
+        let resolvers = resolve_upstream_dns();
+        for r in &resolvers {
+            assert!(
+                !r.starts_with("127."),
+                "IPv4 loopback should be filtered: {r}"
+            );
+            assert_ne!(r, "::1", "IPv6 loopback should be filtered");
+        }
+    }
+
+    #[test]
+    fn resolve_upstream_dns_returns_vec() {
+        // Verify the function returns a vec (may be empty in some CI environments
+        // where no resolv.conf exists, but should never panic).
+        let resolvers = resolve_upstream_dns();
+        assert!(
+            resolvers.len() <= 20,
+            "should return a reasonable number of resolvers"
+        );
+    }
+
+    #[test]
+    fn parse_resolv_conf_filters_ipv4_loopback() {
+        let input = "nameserver 127.0.0.1\nnameserver 127.0.0.53\nnameserver 127.0.0.11\n";
+        assert!(parse_resolv_conf(input).is_empty());
+    }
+
+    #[test]
+    fn parse_resolv_conf_filters_ipv6_loopback() {
+        let input = "nameserver ::1\n";
+        assert!(parse_resolv_conf(input).is_empty());
+    }
+
+    #[test]
+    fn parse_resolv_conf_passes_real_resolvers() {
+        let input = "nameserver 8.8.8.8\nnameserver 1.1.1.1\n";
+        assert_eq!(parse_resolv_conf(input), vec!["8.8.8.8", "1.1.1.1"]);
+    }
+
+    #[test]
+    fn parse_resolv_conf_mixed_loopback_and_real() {
+        let input =
+            "nameserver 127.0.0.53\nnameserver ::1\nnameserver 10.0.0.1\nnameserver 172.16.0.1\n";
+        assert_eq!(parse_resolv_conf(input), vec!["10.0.0.1", "172.16.0.1"]);
+    }
+
+    #[test]
+    fn parse_resolv_conf_ignores_comments_and_other_lines() {
+        let input =
+            "# nameserver 8.8.8.8\nsearch example.com\noptions ndots:5\nnameserver 1.1.1.1\n";
+        assert_eq!(parse_resolv_conf(input), vec!["1.1.1.1"]);
+    }
+
+    #[test]
+    fn parse_resolv_conf_handles_tabs_and_extra_spaces() {
+        let input = "nameserver\t8.8.8.8\nnameserver 1.1.1.1\n";
+        assert_eq!(parse_resolv_conf(input), vec!["8.8.8.8", "1.1.1.1"]);
+    }
+
+    #[test]
+    fn parse_resolv_conf_empty_input() {
+        assert!(parse_resolv_conf("").is_empty());
+        assert!(parse_resolv_conf(" \n\n").is_empty());
+    }
+
+    #[test]
+    fn parse_resolv_conf_bare_nameserver_keyword() {
+        assert!(parse_resolv_conf("nameserver\n").is_empty());
+        assert!(parse_resolv_conf("nameserver \n").is_empty());
+    }
+
+    #[test]
+    fn parse_resolv_conf_systemd_resolved_typical() {
+        let input =
+            "# This is /run/systemd/resolve/resolv.conf\nnameserver 192.168.1.1\nsearch lan\n";
+        assert_eq!(parse_resolv_conf(input), vec!["192.168.1.1"]);
+    }
+
+    #[test]
+    fn parse_resolv_conf_crlf_line_endings() {
+        let input = "nameserver 8.8.8.8\r\nnameserver 1.1.1.1\r\n";
+        assert_eq!(parse_resolv_conf(input), vec!["8.8.8.8", "1.1.1.1"]);
+    }
+
+    #[test]
+    fn parse_memory_limit_gigabytes() {
+        assert_eq!(parse_memory_limit("80g").unwrap(), 80 * 1024 * 1024 * 1024);
+        assert_eq!(parse_memory_limit("80G").unwrap(), 80 * 1024 * 1024 * 1024);
+        assert_eq!(parse_memory_limit("80gb").unwrap(), 80 * 1024 * 1024 * 1024);
+    }
+
+    #[test]
+    fn parse_memory_limit_megabytes() {
+        assert_eq!(parse_memory_limit("4096m").unwrap(), 4096 * 1024 * 1024);
+        assert_eq!(parse_memory_limit("4096M").unwrap(), 4096 * 1024 * 1024);
+    }
+
+    #[test]
+    fn parse_memory_limit_bare_bytes() {
+        assert_eq!(parse_memory_limit("1073741824").unwrap(), 1073741824);
+    }
+
+    #[test]
+    fn parse_memory_limit_binary_suffixes() {
+        assert_eq!(parse_memory_limit("1gi").unwrap(), 1024 * 1024 * 1024);
+        assert_eq!(parse_memory_limit("1gib").unwrap(), 1024 * 1024 * 1024);
+    }
+
+    #[test]
+    fn parse_memory_limit_rejects_empty() {
+        assert!(parse_memory_limit("").is_err());
+    }
+
+    #[test]
+    fn parse_memory_limit_rejects_unknown_suffix() {
+        assert!(parse_memory_limit("10x").is_err());
+    }
+
+    #[test]
+    fn parse_memory_limit_fractional() {
+        // 0.5g = 512 MiB
+        assert_eq!(parse_memory_limit("0.5g").unwrap(), 512 * 1024 * 1024);
+    }
+
+    #[test]
+    fn parse_memory_limit_rejects_zero() {
+        assert!(parse_memory_limit("0g").is_err());
+    }
+
+    #[test]
+    fn parse_memory_limit_rejects_negative() {
+        assert!(parse_memory_limit("-1g").is_err());
+    }
+
+    #[test]
+    fn parse_memory_limit_rejects_below_minimum() {
+        // 1 KiB is far below the minimum the parser enforces
+        assert!(parse_memory_limit("1k").is_err());
+    }
+
+    #[test]
+    fn parse_memory_limit_rejects_overflow() {
+        // 99999999t exceeds i64::MAX (~9.2 exabytes)
+        assert!(parse_memory_limit("99999999t").is_err());
+    }
+
+    #[test]
+    fn parse_memory_limit_whitespace() {
+        assert_eq!(
+            parse_memory_limit(" 80g ").unwrap(),
+            80 * 1024 * 1024 * 1024
+        );
+    }
+
+    // detect_memory_limit is async and requires a Docker daemon connection,
+    // so it is tested via integration / e2e tests rather than unit tests.
 }
diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs
index 9098fd4a..8e7bcdb7 100644
--- a/crates/openshell-bootstrap/src/lib.rs
+++ b/crates/openshell-bootstrap/src/lib.rs
@@ -46,6 +46,7 @@ use crate::runtime::{
 pub use crate::constants::container_name;
 pub use crate::docker::{
     DockerPreflight, ExistingGatewayInfo, check_docker_available, create_ssh_docker_client,
+    detect_memory_limit, parse_memory_limit,
 };
 pub use crate::metadata::{
     GatewayMetadata, clear_active_gateway, extract_host_from_ssh_destination, get_gateway_metadata,
@@ -119,6 +120,11 @@ pub struct DeployOptions {
     /// When false, an existing gateway is left as-is and deployment is
     /// skipped (the caller is responsible for prompting the user first).
     pub recreate: bool,
+    /// Memory limit for the gateway container in bytes. When set, Docker
+    /// enforces the ceiling and OOM-kills the container instead of the host
+    /// kernel OOM-killing unrelated processes. When `None`, auto-detected
+    /// as 80% of available memory via the Docker daemon.
+    pub memory_limit: Option<i64>,
 }
 
 impl DeployOptions {
@@ -135,6 +141,7 @@ impl DeployOptions {
             registry_token: None,
             gpu: false,
             recreate: false,
+            memory_limit: None,
         }
     }
 
@@ -200,6 +207,13 @@ impl DeployOptions {
         self.recreate = recreate;
         self
     }
+
+    /// Set the memory limit for the gateway container in bytes.
+    #[must_use]
+    pub fn with_memory_limit(mut self, limit: i64) -> Self {
+        self.memory_limit = Some(limit);
+        self
+    }
 }
 
 #[derive(Debug, Clone)]
@@ -264,6 +278,7 @@ where
     let registry_token = options.registry_token;
     let gpu = options.gpu;
     let recreate = options.recreate;
+    let explicit_memory_limit = options.memory_limit;
 
     // Wrap on_log in Arc> so we can share it with pull_remote_image
     // which needs a 'static callback for the bollard streaming pull.
@@ -288,6 +303,14 @@ where (preflight.docker, None) }; + // Resolve memory limit: explicit value from CLI, or auto-detect from the + // Docker daemon. On macOS / Windows this correctly reports the Docker + // Desktop VM's memory, not the full host RAM. + let memory_limit = match explicit_memory_limit { + Some(limit) => Some(limit), + None => detect_memory_limit(&target_docker).await, + }; + // If an existing gateway is found, either tear it down (when recreate is // requested) or bail out so the caller can prompt the user / reuse it. if let Some(existing) = check_existing_gateway(&target_docker, &name).await? { @@ -417,6 +440,8 @@ where registry_username.as_deref(), registry_token.as_deref(), gpu, + remote_opts.is_some(), + memory_limit, ) .await?; start_container(&target_docker, &name).await?; diff --git a/crates/openshell-cli/src/bootstrap.rs b/crates/openshell-cli/src/bootstrap.rs index e976061f..fdaedb9d 100644 --- a/crates/openshell-cli/src/bootstrap.rs +++ b/crates/openshell-cli/src/bootstrap.rs @@ -179,6 +179,15 @@ pub async fn run_bootstrap( options = options.with_gateway_host(host); } options = options.with_gpu(gpu); + // Read memory limit override from environment. The explicit `--memory` + // flag is only on `gateway start`; this env var covers the auto-bootstrap + // path triggered by `sandbox create`. + if let Ok(mem_str) = std::env::var("OPENSHELL_MEMORY_LIMIT") + && !mem_str.trim().is_empty() + { + let limit = openshell_bootstrap::parse_memory_limit(&mem_str)?; + options = options.with_memory_limit(limit); + } let handle = deploy_gateway_with_panel(options, &gateway_name, location).await?; let server = handle.gateway_endpoint().to_string(); diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 3799b392..e78d2b6e 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -809,6 +809,17 @@ enum GatewayCommands { /// NVIDIA Container Toolkit on the host. 
#[arg(long)] gpu: bool, + + /// Memory limit for the gateway container. + /// + /// Accepts human-readable sizes: `80g`, `4096m`, `1073741824` (bytes). + /// When unset, defaults to 80% of available memory (auto-detected via + /// the Docker daemon). On macOS and Windows this reflects the Docker + /// Desktop VM's allocated memory, not the full host RAM. Docker + /// OOM-kills the container if it exceeds this limit, preventing + /// runaway sandbox growth from triggering the host kernel OOM killer. + #[arg(long, env = "OPENSHELL_MEMORY_LIMIT")] + memory: Option, }, /// Stop the gateway (preserves state). @@ -1561,7 +1572,12 @@ async fn main() -> Result<()> { registry_username, registry_token, gpu, + memory, } => { + let memory_limit = memory + .as_deref() + .map(openshell_bootstrap::parse_memory_limit) + .transpose()?; run::gateway_admin_deploy( &name, remote.as_deref(), @@ -1574,6 +1590,7 @@ async fn main() -> Result<()> { registry_username.as_deref(), registry_token.as_deref(), gpu, + memory_limit, ) .await?; } diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index a4331ee5..630a2195 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1356,6 +1356,7 @@ pub async fn gateway_admin_deploy( registry_username: Option<&str>, registry_token: Option<&str>, gpu: bool, + memory_limit: Option, ) -> Result<()> { let location = if remote.is_some() { "remote" } else { "local" }; @@ -1421,6 +1422,9 @@ pub async fn gateway_admin_deploy( .with_disable_gateway_auth(disable_gateway_auth) .with_gpu(gpu) .with_recreate(should_recreate); + if let Some(mem) = memory_limit { + options = options.with_memory_limit(mem); + } if let Some(opts) = remote_opts { options = options.with_remote(opts); } diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index 84b8cf9a..0e75e67c 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -69,7 +69,46 @@ 
wait_for_default_route() { # 3. Adding DNAT rules so traffic to :53 reaches Docker's DNS # 4. Writing that IP into the k3s resolv.conf +# Extract upstream DNS resolvers reachable from k3s pod namespaces. +# Docker's embedded DNS (127.0.0.11) is namespace-local — DNAT to it from +# pod traffic is dropped as a martian packet. Use real upstream servers instead. +# +# Priority: +# 1. UPSTREAM_DNS env var (set by bootstrap, comma-separated) +# 2. /etc/resolv.conf (fallback for non-bootstrap launches) +get_upstream_resolvers() { + local resolvers="" + + # Bootstrap-provided resolvers (sniffed from host by the Rust bootstrap crate) + if [ -n "${UPSTREAM_DNS:-}" ]; then + resolvers=$(printf '%s\n' "$UPSTREAM_DNS" | tr ',' '\n' | \ + awk '{ip=$1; if(ip !~ /^127\./ && ip != "::1" && ip != "") print ip}') + fi + + # Fallback: Docker-generated resolv.conf may have non-loopback servers + if [ -z "$resolvers" ]; then + resolvers=$(awk '/^nameserver/{ip=$2; gsub(/\r/,"",ip); if(ip !~ /^127\./ && ip != "::1") print ip}' \ + /etc/resolv.conf) + fi + + echo "$resolvers" +} + setup_dns_proxy() { + # Prefer upstream resolvers that work across network namespaces. + # This avoids the DNAT-to-loopback problem on systemd-resolved hosts. + UPSTREAM_DNS=$(get_upstream_resolvers) + if [ -n "$UPSTREAM_DNS" ]; then + : > "$RESOLV_CONF" + echo "$UPSTREAM_DNS" | while read -r ns; do + [ -n "$ns" ] && echo "nameserver $ns" >> "$RESOLV_CONF" + done + echo "DNS: using upstream resolvers directly (avoids cross-namespace DNAT)" + cat "$RESOLV_CONF" + return 0 + fi + + # Fall back to DNAT proxy when no upstream resolvers are available. # Extract Docker's actual DNS listener ports from the DOCKER_OUTPUT chain. 
# Docker sets up rules like: # -A DOCKER_OUTPUT -d 127.0.0.11/32 -p udp --dport 53 -j DNAT --to-destination 127.0.0.11: @@ -160,6 +199,8 @@ verify_dns() { sleep 1 i=$((i + 1)) done + echo "Warning: DNS verification failed for $lookup_host after $attempts attempts" + echo " resolv.conf: $(head -3 "$RESOLV_CONF" 2>/dev/null)" return 1 }