diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 4d0e4659..5af8895c 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -104,6 +104,7 @@ Look for: - k3s startup errors (certificate issues, port binding failures) - Manifest copy errors from `/opt/openshell/manifests/` - `iptables` or `cgroup` errors (privilege/capability issues) +- `Warning: br_netfilter does not appear to be loaded` — this is advisory only; many kernels work without the explicit module. Only act on it if you also see DNS failures or pod-to-service connectivity problems (see Common Failure Patterns). ### Step 2: Check k3s Cluster Health @@ -308,6 +309,7 @@ If DNS is broken, all image pulls from the distribution registry will fail, as w | Port conflict | Another service on the configured gateway host port (default 8080) | Stop conflicting service or use `--port` on `openshell gateway start` to pick a different host port | | gRPC connect refused to `127.0.0.1:443` in CI | Docker daemon is remote (`DOCKER_HOST=tcp://...`) but metadata still points to loopback | Verify metadata endpoint host matches `DOCKER_HOST` and includes non-loopback host | | DNS failures inside container | Entrypoint DNS detection failed | `openshell doctor exec -- cat /etc/rancher/k3s/resolv.conf` and `openshell doctor logs --lines 20` | +| Pods can't reach kube-dns / ClusterIP services | `br_netfilter` not loaded; bridge traffic bypasses iptables DNAT rules | `sudo modprobe br_netfilter` on the host, then `echo br_netfilter \| sudo tee /etc/modules-load.d/br_netfilter.conf` to persist. Known to be required on Jetson Linux 5.15-tegra; other kernels (e.g. standard x86/aarch64 Linux) may have bridge netfilter built in and work without the module. 
The entrypoint logs a warning when `/proc/sys/net/bridge/bridge-nf-call-iptables` is absent but does not abort — only act on it if DNS or service connectivity is actually broken. | | Node DiskPressure / MemoryPressure / PIDPressure | Insufficient disk, memory, or PIDs on host | Free disk (`docker system prune -a --volumes`), increase memory, or expand host resources. Bootstrap auto-detects via `HEALTHCHECK_NODE_PRESSURE` marker | | Pods evicted with "The node had condition: [DiskPressure]" | Host disk full, kubelet evicting pods | Free disk space on host, then `openshell gateway destroy && openshell gateway start` | | `metrics-server` errors in logs | Normal k3s noise, not the root cause | These errors are benign — look for the actual failing health check component | diff --git a/crates/openshell-sandbox/src/sandbox/linux/netns.rs b/crates/openshell-sandbox/src/sandbox/linux/netns.rs index 5e6907c5..095ed86c 100644 --- a/crates/openshell-sandbox/src/sandbox/linux/netns.rs +++ b/crates/openshell-sandbox/src/sandbox/linux/netns.rs @@ -262,15 +262,18 @@ impl NetworkNamespace { info!( namespace = %self.name, - iptables = iptables_path, + iptables = %iptables_path, proxy_addr = %format!("{}:{}", host_ip_str, proxy_port), "Installing bypass detection rules" ); // Install IPv4 rules - if let Err(e) = - self.install_bypass_rules_for(iptables_path, &host_ip_str, &proxy_port_str, &log_prefix) - { + if let Err(e) = self.install_bypass_rules_for( + &iptables_path, + &host_ip_str, + &proxy_port_str, + &log_prefix, + ) { warn!( namespace = %self.name, error = %e, @@ -281,7 +284,7 @@ impl NetworkNamespace { // Install IPv6 rules — best-effort. // Skip the proxy ACCEPT rule for IPv6 since the proxy address is IPv4. 
- if let Some(ip6_path) = find_ip6tables(iptables_path) { + if let Some(ip6_path) = find_ip6tables(&iptables_path) { if let Err(e) = self.install_bypass_rules_for_v6(&ip6_path, &log_prefix) { warn!( namespace = %self.name, @@ -666,12 +669,92 @@ fn run_iptables_netns(netns: &str, iptables_cmd: &str, args: &[&str]) -> Result< const IPTABLES_SEARCH_PATHS: &[&str] = &["/usr/sbin/iptables", "/sbin/iptables", "/usr/bin/iptables"]; +/// Returns true if xt extension modules (e.g. xt_comment) cannot be used +/// via the given iptables binary. +/// +/// Some kernels have nf_tables but lack the nft_compat bridge that allows +/// xt extension modules to be used through the nf_tables path (e.g. Jetson +/// Linux 5.15-tegra). This probe detects that condition by attempting to +/// insert a rule using the xt_comment extension. If it fails, xt extensions +/// are unavailable and the caller should fall back to iptables-legacy. +fn xt_extensions_unavailable(iptables_path: &str) -> bool { + // Create a temporary probe chain. If this fails (e.g. no CAP_NET_ADMIN), + // we can't determine availability — assume extensions are available. + let created = Command::new(iptables_path) + .args(["-t", "filter", "-N", "_xt_probe"]) + .output() + .map(|o| o.status.success()) + .unwrap_or(false); + + if !created { + return false; + } + + // Attempt to insert a rule using xt_comment. Failure means nft_compat + // cannot bridge xt extension modules on this kernel. + let probe_ok = Command::new(iptables_path) + .args([ + "-t", + "filter", + "-A", + "_xt_probe", + "-m", + "comment", + "--comment", + "probe", + "-j", + "ACCEPT", + ]) + .output() + .map(|o| o.status.success()) + .unwrap_or(false); + + // Clean up — best-effort, ignore failures. 
+ let _ = Command::new(iptables_path) + .args([ + "-t", + "filter", + "-D", + "_xt_probe", + "-m", + "comment", + "--comment", + "probe", + "-j", + "ACCEPT", + ]) + .output(); + let _ = Command::new(iptables_path) + .args(["-t", "filter", "-X", "_xt_probe"]) + .output(); + + !probe_ok +} + /// Find the iptables binary path, checking well-known locations. -fn find_iptables() -> Option<&'static str> { - IPTABLES_SEARCH_PATHS +/// +/// If xt extension modules are unavailable via the standard binary and +/// `iptables-legacy` is available alongside it, the legacy binary is returned +/// instead. This ensures bypass-detection rules can be installed on kernels +/// where `nft_compat` is unavailable (e.g. Jetson Linux 5.15-tegra). +fn find_iptables() -> Option<String> { + let standard_path = IPTABLES_SEARCH_PATHS .iter() .find(|path| std::path::Path::new(path).exists()) - .copied() + .copied()?; + + if xt_extensions_unavailable(standard_path) { + let legacy_path = standard_path.replace("iptables", "iptables-legacy"); + if std::path::Path::new(&legacy_path).exists() { + debug!( + legacy = legacy_path, + "xt extensions unavailable; using iptables-legacy" + ); + return Some(legacy_path); + } + } + + Some(standard_path.to_string()) } /// Find the ip6tables binary path, deriving it from the iptables location. diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index 84b8cf9a..2fea6fa6 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -25,6 +25,61 @@ set -e +# --------------------------------------------------------------------------- +# Select iptables backend +# --------------------------------------------------------------------------- +# Some kernels (e.g. Jetson Linux 5.15-tegra) have the nf_tables subsystem +# but lack the nft_compat bridge that allows flannel and kube-proxy to use +# xt extension modules (xt_comment, xt_conntrack).
Detect this by probing +# whether xt_comment is usable via the current iptables backend. If the +# probe fails, switch to iptables-legacy. Set USE_IPTABLES_LEGACY=1 +# externally to skip the probe and force the legacy backend. +# --------------------------------------------------------------------------- +# Check br_netfilter kernel module +# --------------------------------------------------------------------------- +# br_netfilter makes the kernel pass bridge (pod-to-pod) traffic through +# iptables. Without it, kube-proxy's DNAT rules for ClusterIP services are +# never applied to pod traffic, so pods cannot reach services such as +# kube-dns (10.43.0.10), breaking all in-cluster DNS resolution. +# +# The module must be loaded on the HOST before the container starts — +# containers cannot load kernel modules themselves. If it is missing, log a +# warning rather than failing hard: some kernels have bridge netfilter support +# built-in or expose it differently, and will work correctly without the module +# being explicitly loaded as a separate .ko. +if [ ! -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then + echo "Warning: br_netfilter does not appear to be loaded on the host." >&2 + echo " Pod-to-service networking (including kube-dns) may not work without it." >&2 + echo " If the cluster fails to start or DNS is broken, try loading it on the host:" >&2 + echo " sudo modprobe br_netfilter" >&2 + echo " To persist across reboots:" >&2 + echo " echo br_netfilter | sudo tee /etc/modules-load.d/br_netfilter.conf" >&2 +fi + +if [ -z "${USE_IPTABLES_LEGACY:-}" ]; then + if iptables -t filter -N _xt_probe 2>/dev/null; then + _probe_rc=0 + iptables -t filter -A _xt_probe -m comment --comment "probe" -j ACCEPT \ + 2>/dev/null || _probe_rc=$? 
+ iptables -t filter -D _xt_probe -m comment --comment "probe" -j ACCEPT \ + 2>/dev/null || true + iptables -t filter -X _xt_probe 2>/dev/null || true + [ "$_probe_rc" -ne 0 ] && USE_IPTABLES_LEGACY=1 + fi +fi + +if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then + echo "iptables nf_tables xt extension bridge unavailable — switching to iptables-legacy" + if update-alternatives --set iptables /usr/sbin/iptables-legacy 2>/dev/null && \ + update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy 2>/dev/null; then + echo "Now using iptables-legacy mode" + else + echo "Warning: could not switch to iptables-legacy — cluster networking may fail" + fi +fi + +IPTABLES=$([ "${USE_IPTABLES_LEGACY:-0}" = "1" ] && echo iptables-legacy || echo iptables) + RESOLV_CONF="/etc/rancher/k3s/resolv.conf" has_default_route() { @@ -74,11 +129,11 @@ setup_dns_proxy() { # Docker sets up rules like: # -A DOCKER_OUTPUT -d 127.0.0.11/32 -p udp --dport 53 -j DNAT --to-destination 127.0.0.11: # -A DOCKER_OUTPUT -d 127.0.0.11/32 -p tcp --dport 53 -j DNAT --to-destination 127.0.0.11: - UDP_PORT=$(iptables -t nat -S DOCKER_OUTPUT 2>/dev/null \ + UDP_PORT=$($IPTABLES -t nat -S DOCKER_OUTPUT 2>/dev/null \ | grep -- '-p udp.*--dport 53' \ | sed -n 's/.*--to-destination 127.0.0.11:\([0-9]*\).*/\1/p' \ | head -1) - TCP_PORT=$(iptables -t nat -S DOCKER_OUTPUT 2>/dev/null \ + TCP_PORT=$($IPTABLES -t nat -S DOCKER_OUTPUT 2>/dev/null \ | grep -- '-p tcp.*--dport 53' \ | sed -n 's/.*--to-destination 127.0.0.11:\([0-9]*\).*/\1/p' \ | head -1) @@ -101,9 +156,9 @@ setup_dns_proxy() { echo "Setting up DNS proxy: ${CONTAINER_IP}:53 -> 127.0.0.11 (udp:${UDP_PORT}, tcp:${TCP_PORT})" # Forward DNS from pods (PREROUTING) and local processes (OUTPUT) to Docker's DNS - iptables -t nat -I PREROUTING -p udp --dport 53 -d "$CONTAINER_IP" -j DNAT \ + $IPTABLES -t nat -I PREROUTING -p udp --dport 53 -d "$CONTAINER_IP" -j DNAT \ --to-destination "127.0.0.11:${UDP_PORT}" - iptables -t nat -I PREROUTING -p tcp --dport 53 -d 
"$CONTAINER_IP" -j DNAT \ + $IPTABLES -t nat -I PREROUTING -p tcp --dport 53 -d "$CONTAINER_IP" -j DNAT \ --to-destination "127.0.0.11:${TCP_PORT}" echo "nameserver $CONTAINER_IP" > "$RESOLV_CONF" @@ -495,6 +550,13 @@ if [ ! -f /sys/fs/cgroup/cgroup.controllers ]; then EXTRA_KUBELET_ARGS="--kubelet-arg=fail-cgroupv1=false" fi +# On kernels where xt_comment is unavailable, kube-router's network policy +# controller panics at startup. Disable it when the iptables-legacy probe +# triggered; sandbox isolation is enforced by the NSSH1 HMAC handshake instead. +if [ "${USE_IPTABLES_LEGACY:-0}" = "1" ]; then + EXTRA_KUBELET_ARGS="$EXTRA_KUBELET_ARGS --disable-network-policy" +fi + # Docker Desktop can briefly start the container before its bridge default route # is fully installed. k3s exits immediately in that state, so wait briefly for # routing to settle first.