Group J — closes #127, #128, #129, #130, #131, #132. Defaults - control_plane_size default cx42 (16 GB) — cx32 (8 GB) is INSUFFICIENT for a solo Sovereign per PLATFORM-TECH-STACK.md §7.1 (~11.3 GB Catalyst) + §7.4 (~8.8 GB per-host-cluster) = ~20 GB minimum. The previous cx32 default would OOM during the OpenBao + Keycloak step of bootstrap. - New k3s_version variable (v1.31.4+k3s1) — pinned, validated against the INSTALL_K3S_VERSION format. Previously hardcoded inside the cloud-init templates, in violation of INVIOLABLE-PRINCIPLES.md §4. Validation - Region restricted to the 5 known Hetzner locations. - control_plane_size + worker_size restricted to the cxNN | ccxNN | caxNN namespace (blocks tiny dev sizes that would OOM at runtime). - k3s_version regex matches the upstream installer's version format. - ssh_allowed_cidrs validated as proper CIDRs. Firewall - Document each open port (80, 443, 6443, ICMP) and each blocked port (22, 10250, 2379/2380, 8472) in README.md §"Firewall rules". - SSH (22) is now a dynamic rule keyed off ssh_allowed_cidrs (default empty = no SSH at the firewall, break-glass via Hetzner Console). OS hardening (cloudinit-*.tftpl) - sshd drop-in: PasswordAuthentication no, PermitRootLogin prohibit-password, no forwarding, MaxAuthTries=3, LoginGraceTime=30. - enable_unattended_upgrades (default true): security-only pocket, auto-reboot at 02:30, removes unused kernels. - enable_fail2ban (default true): sshd jail, systemd backend. - Both control-plane and worker templates carry the same baseline. Documentation - New infra/hetzner/README.md (operator-facing) covers: * What the module creates + Phase-0/Phase-1 boundary. * Sizing rationale with the §7.1+§7.4 RAM math + upgrade path. * Firewall rules: every open port, every blocked port, every deliberate egress flow. * k3s flag-by-flag rationale tied to PLATFORM-TECH-STACK.md §8. * SSH key management: why no auto-generated keys (break-glass + audit-trail + custody + compliance). * OS hardening table. 
* Standalone CLI invocation pattern (tofu apply -var-file=...). * What the module does NOT do (Crossplane / Flux territory). Closes #127 #128 #129 #130 #131 #132
203 lines
8.5 KiB
Plaintext
203 lines
8.5 KiB
Plaintext
#cloud-config
|
|
# Catalyst Sovereign control-plane bootstrap.
|
|
# Sovereign: ${sovereign_fqdn}
|
|
# Provisioned by: catalyst-provisioner (https://console.openova.io/sovereign)
|
|
#
|
|
# This script:
|
|
# 1. Installs OS hardening (SSH password-auth off, fail2ban, unattended-upgrades).
|
|
# 2. Installs k3s with --flannel-backend=none (Cilium replaces it).
|
|
# 3. Installs Flux + bootstraps the GitRepository pointing at clusters/${sovereign_fqdn}/
|
|
# in the public OpenOva monorepo. From this point Flux is the GitOps
|
|
# reconciler and installs the 11-component bootstrap kit
|
|
# (Cilium → cert-manager → Crossplane → ... → bp-catalyst-platform) in
|
|
# dependency order via Kustomizations the cluster directory ships.
|
|
# 4. Touches /var/lib/catalyst/cloud-init-complete so the catalyst-api
|
|
# provisioner can detect cloud-init has finished.
|
|
|
|
# Refresh the apt package index on first boot; already-installed packages
# are not upgraded at this stage.
package_update: true
package_upgrade: false

# Packages installed before write_files/runcmd content is used. The optional
# entries are gated on the enable_fail2ban / enable_unattended_upgrades
# template variables. The list is written flush-left on purpose: a trailing
# strip marker on a template directive consumes the whitespace that follows
# it, including the indentation of the next line.
packages:
- curl
- iptables
- jq
- ca-certificates
- git
%{ if enable_fail2ban ~}
- fail2ban
%{ endif ~}
%{ if enable_unattended_upgrades ~}
- unattended-upgrades
- apt-listchanges
%{ endif ~}
|
|
|
|
write_files:
# Record of the parameters this node was provisioned with, stored as JSON.
# Every string field is rendered through jsonencode so the document stays
# valid JSON even when a value contains a quote or a backslash; haEnabled
# and workerCount are emitted bare so they remain a JSON boolean / number.
- path: /var/lib/catalyst/sovereign.json
  permissions: '0644'
  content: |
    {
      "sovereignFQDN": ${jsonencode(sovereign_fqdn)},
      "sovereignSubdomain": ${jsonencode(sovereign_subdomain)},
      "orgName": ${jsonencode(org_name)},
      "orgEmail": ${jsonencode(org_email)},
      "region": ${jsonencode(region)},
      "haEnabled": ${ha_enabled},
      "workerCount": ${worker_count},
      "k3sVersion": ${jsonencode(k3s_version)},
      "gitopsRepoUrl": ${jsonencode(gitops_repo_url)},
      "gitopsBranch": ${jsonencode(gitops_branch)}
    }
|
|
|
|
# ── OS hardening: SSH daemon ──────────────────────────────────────────
# sshd drop-in that leaves key-based login as the only way in: the operator
# is expected to use the Hetzner-project SSH key that cloud-init places in
# authorized_keys. Password, keyboard-interactive and empty-password logins
# are disabled, root may never authenticate with a password, and X11, agent
# and TCP forwarding are switched off.
# NOTE(review): sshd keeps the first value it reads for each keyword, so any
# drop-in that sorts before 99-* (for example a 50-cloud-init.conf, if the
# image ships one) would take precedence over this file — confirm on the
# target image.
- path: /etc/ssh/sshd_config.d/99-catalyst-hardening.conf
  permissions: '0644'
  content: |
    # Managed by Catalyst Sovereign cloud-init — do not edit by hand.
    PasswordAuthentication no
    KbdInteractiveAuthentication no
    ChallengeResponseAuthentication no
    PermitRootLogin prohibit-password
    PermitEmptyPasswords no
    UsePAM yes
    X11Forwarding no
    AllowAgentForwarding no
    AllowTcpForwarding no
    ClientAliveInterval 300
    ClientAliveCountMax 2
    MaxAuthTries 3
    LoginGraceTime 30
|
|
|
|
%{ if enable_unattended_upgrades ~}
# ── Unattended security upgrades ──────────────────────────────────────
# Two apt configuration files, written only when enable_unattended_upgrades
# is set:
#   20auto-upgrades                 daily package-list refresh and daily
#                                   unattended-upgrade run, autoclean weekly.
#   52unattended-upgrades-catalyst  limits upgrades to the security origins,
#                                   allows an automatic reboot at 02:30 and
#                                   removes unused kernels and dependencies.
# The automatic reboot relies on k3s tolerating a single-node restart on a
# solo Sovereign within the ~60s window the Hetzner LB health-check covers.
# The doubled dollar signs below are the template escape; apt receives the
# single-dollar form and expands distro_id / distro_codename itself.
- path: /etc/apt/apt.conf.d/20auto-upgrades
  permissions: '0644'
  content: |
    APT::Periodic::Update-Package-Lists "1";
    APT::Periodic::Unattended-Upgrade "1";
    APT::Periodic::AutocleanInterval "7";
- path: /etc/apt/apt.conf.d/52unattended-upgrades-catalyst
  permissions: '0644'
  content: |
    Unattended-Upgrade::Allowed-Origins {
      "$${distro_id}:$${distro_codename}-security";
      "$${distro_id}ESMApps:$${distro_codename}-apps-security";
      "$${distro_id}ESM:$${distro_codename}-infra-security";
    };
    Unattended-Upgrade::Automatic-Reboot "true";
    Unattended-Upgrade::Automatic-Reboot-Time "02:30";
    Unattended-Upgrade::Remove-Unused-Kernel-Packages "true";
    Unattended-Upgrade::Remove-Unused-Dependencies "true";
%{ endif ~}
|
|
|
|
%{ if enable_fail2ban ~}
# ── fail2ban: sshd jail ───────────────────────────────────────────────
# Written only when enable_fail2ban is set. SSH is already restricted to
# ssh_allowed_cidrs (or closed entirely) at the firewall; this jail is the
# second layer for the day an operator widens that rule after bootstrap.
# Policy: 5 failures within 10 minutes earn a 1 hour ban, with log entries
# read through the systemd backend.
- path: /etc/fail2ban/jail.d/catalyst-sshd.local
  permissions: '0644'
  content: |
    [sshd]
    enabled = true
    port = ssh
    filter = sshd
    maxretry = 5
    findtime = 10m
    bantime = 1h
    backend = systemd
%{ endif ~}
|
|
|
|
# Flux hand-over manifest, applied by runcmd once the Flux controllers are
# installed. It holds two documents:
#   GitRepository "openova"            tracks the GitOps repo/branch and, via
#                                      the ignore list, only fetches
#                                      clusters/${sovereign_fqdn}, platform
#                                      and products.
#   Kustomization "catalyst-bootstrap" reconciles ./clusters/${sovereign_fqdn},
#                                      whose Kustomization tree installs the
#                                      11-component bootstrap kit and the
#                                      bp-catalyst-platform umbrella in
#                                      dependency order.
# url and branch are rendered through jsonencode so they always reach the
# YAML parser as quoted strings: a branch such as "1.0" or "on" would
# otherwise be read as a number or boolean, and a " #" inside the URL would
# start a comment.
- path: /var/lib/catalyst/flux-bootstrap.yaml
  permissions: '0644'
  content: |
    apiVersion: source.toolkit.fluxcd.io/v1
    kind: GitRepository
    metadata:
      name: openova
      namespace: flux-system
    spec:
      interval: 1m
      url: ${jsonencode(gitops_repo_url)}
      ref:
        branch: ${jsonencode(gitops_branch)}
      ignore: |
        /*
        !/clusters/${sovereign_fqdn}
        !/platform
        !/products
    ---
    apiVersion: kustomize.toolkit.fluxcd.io/v1
    kind: Kustomization
    metadata:
      name: catalyst-bootstrap
      namespace: flux-system
    spec:
      interval: 5m
      path: ./clusters/${sovereign_fqdn}
      prune: true
      sourceRef:
        kind: GitRepository
        name: openova
      wait: true
      timeout: 30m
|
|
|
|
# First-boot command sequence. Order matters: host preparation, hardening
# services, k3s, Flux, the Flux hand-over manifest, and last the completion
# marker.
runcmd:
# Host preparation: turn swap off now and drop swap entries from fstab so it
# stays off, then switch iptables/ip6tables to the legacy backend
# (best-effort — a failure is ignored).
- swapoff -a
- sed -i '/swap/d' /etc/fstab
- update-alternatives --set iptables /usr/sbin/iptables-legacy || true
- update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy || true

# Make sshd pick up the hardening drop-in. The unit is tried under both
# names (ssh, then sshd). authorized_keys, which cloud-init may already have
# populated from the Hetzner ssh_keys[], is never touched here.
- systemctl reload ssh || systemctl reload sshd || true
%{ if enable_fail2ban ~}
- systemctl enable --now fail2ban
%{ endif ~}
%{ if enable_unattended_upgrades ~}
- systemctl enable --now unattended-upgrades
%{ endif ~}

# Install the k3s server at the pinned version. Flag rationale follows
# docs/SOVEREIGN-PROVISIONING.md §3 and docs/PLATFORM-TECH-STACK.md §8.1:
#   --cluster-init              embedded etcd, so the cluster is HA-ready
#   --flannel-backend=none      Cilium is the CNI instead of flannel
#   --disable-network-policy    NetworkPolicy is enforced by Cilium
#   --disable=traefik           Cilium Gateway takes traefik's place
#   --disable=servicelb         ingress goes through the Hetzner LB
#   --disable=local-storage     Crossplane-provisioned hcloud-csi is used
#   --tls-san=${sovereign_fqdn} API server cert covers the sovereign FQDN
- 'curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=${k3s_version} K3S_TOKEN=${k3s_token} INSTALL_K3S_EXEC="server --cluster-init --flannel-backend=none --disable-network-policy --disable=traefik --disable=servicelb --disable=local-storage --tls-san=${sovereign_fqdn} --node-label catalyst.openova.io/role=control-plane --write-kubeconfig-mode=0644" sh -'

# Poll the API server health endpoint every 5 seconds. The check targets the
# API endpoint and not node readiness, because nodes only turn Ready once
# Cilium is running.
# NOTE(review): this loop has no upper bound — if the k3s install above
# failed it never returns.
- 'until kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml get --raw /healthz; do sleep 5; done'

# Install the Flux controllers and wait up to 300s for their Deployments.
# Flux is the first and only in-cluster orchestrator; Cilium, cert-manager,
# Crossplane and the rest arrive through Flux reconciling
# clusters/${sovereign_fqdn}/ (INVIOLABLE-PRINCIPLES.md principle #3: no
# helm/kubectl exec from outside).
# NOTE(review): the Flux release (v2.4.0) is hard-coded in the URL, unlike
# the k3s version, which comes from a template variable.
# NOTE(review): no CNI exists at this point (flannel is disabled and Cilium
# is delivered later by Flux) — confirm the Flux Deployments can reach
# Available here, and what should happen if the 300s wait expires.
- 'curl -fsSL https://github.com/fluxcd/flux2/releases/download/v2.4.0/install.yaml | kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml apply -f -'
- 'kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml -n flux-system wait --for=condition=Available --timeout=300s deployment --all'

# Hand the cluster to Flux: apply the GitRepository + Kustomization written
# by write_files. From here Flux pulls clusters/${sovereign_fqdn}/ and
# installs Cilium (bp-cilium), cert-manager (bp-cert-manager) and so on,
# ending with bp-catalyst-platform.
- 'kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml apply -f /var/lib/catalyst/flux-bootstrap.yaml'

# Completion marker the catalyst-api provisioner looks for to detect that
# cloud-init has finished.
- mkdir -p /var/lib/catalyst
- touch /var/lib/catalyst/cloud-init-complete
|
|
|
|
# Closing line cloud-init logs when the run ends; $UPTIME is left for
# cloud-init to fill in. Single-quoted because the text needs no escapes.
final_message: 'Catalyst control-plane bootstrap complete after $UPTIME seconds — Flux is now reconciling clusters/${sovereign_fqdn}/'
|