openova/infra/hetzner/cloudinit-control-plane.tftpl
hatiyildiz acf426c5a9 feat(catalyst-api): cloud-init PUTs kubeconfig back via bearer token (closes #183)
Implement Option D from issue #183: the new Sovereign's cloud-init
PUTs its rewritten kubeconfig (server URL pinned to the LB public
IP, k3s service-account token in the body) to catalyst-api over
HTTPS using a per-deployment bearer token. catalyst-api never SSHs
into the Sovereign — by design, it does not hold the SSH private
key (the wizard returns it once to the browser and does not
persist it on the catalyst-api side).

How the bearer flow works
-------------------------
1. CreateDeployment mints a 32-byte random bearer (crypto/rand,
   hex-encoded), computes its SHA-256, and persists ONLY the
   hash on Deployment.kubeconfigBearerHash. Plaintext is stamped
   onto provisioner.Request just long enough for writeTfvars to
   render it into the per-deployment OpenTofu workdir, then GC'd.
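
   For illustration, a minimal Go sketch of the mint/hash pair. The
   helper names newBearerToken / hashBearerToken appear in the tests
   below; the exact signatures are an assumption, and only the stated
   behaviour (32 bytes from crypto/rand, hex-encoded, hashed with
   SHA-256) is guaranteed by this commit:

      package handler

      import (
          "crypto/rand"
          "crypto/sha256"
          "encoding/hex"
      )

      // newBearerToken mints the per-deployment bearer: 32 random bytes, hex-encoded.
      func newBearerToken() (string, error) {
          b := make([]byte, 32)
          if _, err := rand.Read(b); err != nil {
              return "", err
          }
          return hex.EncodeToString(b), nil
      }

      // hashBearerToken is the only value persisted (Deployment.kubeconfigBearerHash);
      // the plaintext bearer itself is never stored by catalyst-api.
      func hashBearerToken(token string) string {
          sum := sha256.Sum256([]byte(token))
          return hex.EncodeToString(sum[:])
      }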

2. infra/hetzner/variables.tf adds three variables — deployment_id,
   kubeconfig_bearer_token (sensitive), catalyst_api_url. main.tf
   passes them through templatefile() with load_balancer_ipv4 read
   from hcloud_load_balancer.main.ipv4.

3. cloudinit-control-plane.tftpl, after `kubectl --raw /healthz`
   succeeds, sed-rewrites k3s.yaml's https://127.0.0.1:6443 to the
   LB's public IPv4, writes the result to a 0600 file, and curls
   PUT to {catalyst_api_url}/api/v1/deployments/{deployment_id}/
   kubeconfig with `Authorization: Bearer {token}`. --retry 60
   --retry-delay 10 --retry-all-errors handles transient
   reachability gaps. The 0600 file is removed after the PUT.

4. PUT /api/v1/deployments/{id}/kubeconfig:
   - Reads `Authorization: Bearer <token>` (RFC 6750).
   - Computes SHA-256 of the inbound bearer, constant-time-compares
     to the persisted hash via subtle.ConstantTimeCompare.
   - 401 on missing/malformed Authorization, 403 on bearer
     mismatch, 403 if no hash on record, 403 if KubeconfigPath
     already set (single-use replay defence), 422 on empty/oversize
     body, 503 if the kubeconfigs directory is unwritable.
   - On success (204): writes the body to /var/lib/catalyst/kubeconfigs/
     <id>.yaml at mode 0600 (atomic temp+rename), sets
     Result.KubeconfigPath, calls persistDeployment, then spawns
     `go runPhase1Watch(dep)`. A sketch of the full check order follows
     this list.
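
   As referenced above, a compressed Go sketch of that check order in the
   same package as the sketch under step 1. The Deployment shape, the size
   cap, and the handler wiring are assumptions for illustration; the status
   codes and the temp+rename write mirror the spec:

      import (
          "crypto/sha256"
          "crypto/subtle"
          "encoding/hex"
          "io"
          "net/http"
          "os"
          "path/filepath"
          "strings"
      )

      // Deployment carries only the fields this sketch needs.
      type Deployment struct {
          ID                   string
          KubeconfigBearerHash string
          KubeconfigPath       string
      }

      const maxKubeconfigBytes = 1 << 20 // assumed cap; the real limit is not stated here

      func putKubeconfig(w http.ResponseWriter, r *http.Request, dep *Deployment, dir string) {
          auth := r.Header.Get("Authorization")
          const scheme = "bearer "
          if len(auth) <= len(scheme) || !strings.EqualFold(auth[:len(scheme)], scheme) {
              w.WriteHeader(http.StatusUnauthorized) // 401: missing/malformed Authorization
              return
          }
          token := auth[len(scheme):]
          if dep.KubeconfigBearerHash == "" || dep.KubeconfigPath != "" {
              w.WriteHeader(http.StatusForbidden) // 403: no hash on record / single-use replay
              return
          }
          sum := sha256.Sum256([]byte(token))
          if subtle.ConstantTimeCompare([]byte(hex.EncodeToString(sum[:])), []byte(dep.KubeconfigBearerHash)) != 1 {
              w.WriteHeader(http.StatusForbidden) // 403: bearer mismatch
              return
          }
          body, err := io.ReadAll(io.LimitReader(r.Body, maxKubeconfigBytes+1))
          if err != nil || len(body) == 0 || len(body) > maxKubeconfigBytes {
              w.WriteHeader(http.StatusUnprocessableEntity) // 422: empty or oversize body
              return
          }
          dst := filepath.Join(dir, dep.ID+".yaml")
          tmp := dst + ".tmp"
          if err := os.WriteFile(tmp, body, 0o600); err != nil {
              w.WriteHeader(http.StatusServiceUnavailable) // 503: kubeconfigs dir unwritable
              return
          }
          if err := os.Rename(tmp, dst); err != nil { // atomic temp+rename
              w.WriteHeader(http.StatusServiceUnavailable)
              return
          }
          dep.KubeconfigPath = dst
          // real handler: persistDeployment(dep), then `go runPhase1Watch(dep)`
          w.WriteHeader(http.StatusNoContent) // 204
      }

   The 404 deployment lookup and the phase1Started guard sit outside this
   sketch, in the routing and watch-launch code.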

5. GET /api/v1/deployments/{id}/kubeconfig now reads the file at
   Result.KubeconfigPath. It returns 409 with {"error":"not-implemented"}
   when the postback hasn't happened yet (preserving the wizard's
   existing StepSuccess fallback), and 409 with
   {"error":"kubeconfig-file-missing"} on PVC drift.

6. internal/store: Record carries KubeconfigBearerHash. The path
   pointer round-trips via Result.KubeconfigPath; the JSON record
   NEVER contains the kubeconfig plaintext (a test greps the on-disk
   JSON for the kubeconfig sentinels and asserts zero matches).
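
   Illustrative record shape only (the real struct carries more fields);
   the point is that the JSON on disk holds a hash and a path pointer,
   never kubeconfig bytes:

      type Record struct {
          ID                   string `json:"id"`
          KubeconfigBearerHash string `json:"kubeconfigBearerHash,omitempty"`
          KubeconfigPath       string `json:"kubeconfigPath,omitempty"`
          // no plaintext kubeconfig field: the YAML lives only at KubeconfigPath
      }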

7. restoreFromStore relaunches helmwatch on Pod restart for any
   rehydrated deployment whose Result.KubeconfigPath points at an
   existing file, whose Phase1FinishedAt is nil, and whose original
   status was not in-flight (the existing
   in-flight-status-rewrite-to-failed contract is preserved; see the
   shouldResumePhase1 sketch below). Channels are re-allocated for
   resumed deployments because the ones loaded via fromRecord are
   closed.
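
   The resume gate, sketched in Go (shouldResumePhase1 is the name used
   in the tests below; the parameter shape is an assumption):

      import (
          "os"
          "time"
      )

      func shouldResumePhase1(kubeconfigPath string, phase1FinishedAt *time.Time, wasInFlight bool) bool {
          if wasInFlight { // in-flight statuses are rewritten to failed, never resumed
              return false
          }
          if phase1FinishedAt != nil { // Phase 1 already finished, nothing to watch
              return false
          }
          if kubeconfigPath == "" { // postback never happened
              return false
          }
          if _, err := os.Stat(kubeconfigPath); err != nil { // file must still exist on the PVC
              return false
          }
          return true
      }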

8. internal/handler/phase1_watch.go reads kubeconfig YAML from
   the file at Result.KubeconfigPath (not from a string field on
   Result). The Result.Kubeconfig field is removed entirely; the
   on-disk JSON only carries kubeconfigPath.

Tests
-----
internal/handler/kubeconfig_test.go covers every spec gate:
- PUT 401 missing/malformed Authorization
- PUT 403 bearer mismatch / no-bearer-hash / already-set
- PUT 422 empty body / oversize body
- PUT 404 deployment not found
- PUT 204 first success, file at <dir>/<id>.yaml mode 0600,
  Result.KubeconfigPath set, on-disk JSON has kubeconfigPath
  pointer with no plaintext leak
- PUT triggers Phase 1 helmwatch goroutine
- GET reads from path-pointer
- GET 409 path-pointer-set-but-file-missing
- newBearerToken / hashBearerToken round-trip + entropy
- subtle.ConstantTimeCompare correctness
- shouldResumePhase1 gates every branch
- restoreFromStore re-launches helmwatch on rehydrated deployments
- phase1Started guard prevents double watch (PUT then runProvisioning)
- extractBearer RFC 6750 case-insensitive scheme
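
As one concrete example of the cases above, a sketch of the RFC 6750
scheme test, assuming a signature of extractBearer(header string)
(string, bool) in the handler package's test file:

    func TestExtractBearerCaseInsensitiveScheme(t *testing.T) {
        cases := []struct {
            header string
            want   string
            ok     bool
        }{
            {"Bearer abc123", "abc123", true},
            {"bearer abc123", "abc123", true}, // RFC 6750 scheme is case-insensitive
            {"BEARER abc123", "abc123", true},
            {"Basic abc123", "", false}, // wrong scheme
            {"", "", false},             // missing header
        }
        for _, c := range cases {
            got, ok := extractBearer(c.header)
            if ok != c.ok || got != c.want {
                t.Errorf("extractBearer(%q) = %q, %v; want %q, %v", c.header, got, ok, c.want, c.ok)
            }
        }
    }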

Chart
-----
products/catalyst/chart/templates/api-deployment.yaml mounts the
existing catalyst-api-deployments PVC at /var/lib/catalyst (one
level up) so deployments/<id>.json and kubeconfigs/<id>.yaml live
on the same single-attach volume — no second PVC. Adds env vars
CATALYST_KUBECONFIGS_DIR=/var/lib/catalyst/kubeconfigs and
CATALYST_API_PUBLIC_URL=https://console.openova.io/sovereign.

Per docs/INVIOLABLE-PRINCIPLES.md
- #3: OpenTofu is still the only Phase-0 IaC; cloud-init is part of
  the OpenTofu module's templated user_data, not a separate code
  path. catalyst-api never execs helm/kubectl/ssh.
- #4: catalyst_api_url is runtime-configurable
  (CATALYST_API_PUBLIC_URL env var), so air-gapped franchises
  override without code changes.
- #10: Bearer plaintext NEVER lands on disk on the catalyst-api
  side (only the SHA-256 hash). Kubeconfig plaintext NEVER lands
  in the JSON record (only the file path). The kubeconfig file is
  chmod 0600 and the directory 0700 owned by the catalyst-api UID.

Closes #183.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 19:26:53 +02:00

#cloud-config
# Catalyst Sovereign control-plane bootstrap.
# Sovereign: ${sovereign_fqdn}
# Provisioned by: catalyst-provisioner (https://console.openova.io/sovereign)
#
# This script:
#   1. Installs OS hardening (SSH password-auth off, fail2ban, unattended-upgrades).
#   2. Installs k3s with --flannel-backend=none (Cilium replaces it).
#   3. Installs Flux + bootstraps the GitRepository pointing at
#      clusters/${sovereign_fqdn}/ in the public OpenOva monorepo. From this
#      point Flux is the GitOps reconciler and installs the 11-component
#      bootstrap kit (Cilium → cert-manager → Crossplane → ... →
#      bp-catalyst-platform) in dependency order via Kustomizations the
#      cluster directory ships.
#   4. Touches /var/lib/catalyst/cloud-init-complete so the catalyst-api
#      provisioner can detect cloud-init has finished.
package_update: true
package_upgrade: false
packages:
  - curl
  - iptables
  - jq
  - ca-certificates
  - git
%{ if enable_fail2ban ~}
  - fail2ban
%{ endif ~}
%{ if enable_unattended_upgrades ~}
  - unattended-upgrades
  - apt-listchanges
%{ endif ~}
write_files:
  - path: /var/lib/catalyst/sovereign.json
    permissions: '0644'
    content: |
      {
        "sovereignFQDN": "${sovereign_fqdn}",
        "sovereignSubdomain": "${sovereign_subdomain}",
        "orgName": ${jsonencode(org_name)},
        "orgEmail": ${jsonencode(org_email)},
        "region": "${region}",
        "haEnabled": ${ha_enabled},
        "workerCount": ${worker_count},
        "k3sVersion": "${k3s_version}",
        "gitopsRepoUrl": "${gitops_repo_url}",
        "gitopsBranch": "${gitops_branch}"
      }
  # ── OS hardening: SSH daemon ──────────────────────────────────────────
  # Drop-in overrides /etc/ssh/sshd_config defaults. Per Catalyst's threat
  # model the operator's only valid path in is the Hetzner-project SSH key
  # injected via cloud-init authorized_keys. Password auth, KbdInteractive,
  # and root password login are all off.
  - path: /etc/ssh/sshd_config.d/99-catalyst-hardening.conf
    permissions: '0644'
    content: |
      # Managed by Catalyst Sovereign cloud-init — do not edit by hand.
      PasswordAuthentication no
      KbdInteractiveAuthentication no
      ChallengeResponseAuthentication no
      PermitRootLogin prohibit-password
      PermitEmptyPasswords no
      UsePAM yes
      X11Forwarding no
      AllowAgentForwarding no
      AllowTcpForwarding no
      ClientAliveInterval 300
      ClientAliveCountMax 2
      MaxAuthTries 3
      LoginGraceTime 30
%{ if enable_unattended_upgrades ~}
  # ── Unattended security upgrades ──────────────────────────────────────
  # Ubuntu's stock unattended-upgrades, restricted to the security pocket.
  # Runs daily, reboots automatically at 02:30 if a kernel upgrade requires
  # it (k3s tolerates single-node restarts on a solo Sovereign within the
  # ~60s window the Hetzner LB health-check covers).
  - path: /etc/apt/apt.conf.d/20auto-upgrades
    permissions: '0644'
    content: |
      APT::Periodic::Update-Package-Lists "1";
      APT::Periodic::Unattended-Upgrade "1";
      APT::Periodic::AutocleanInterval "7";
  - path: /etc/apt/apt.conf.d/52unattended-upgrades-catalyst
    permissions: '0644'
    content: |
      Unattended-Upgrade::Allowed-Origins {
          "$${distro_id}:$${distro_codename}-security";
          "$${distro_id}ESMApps:$${distro_codename}-apps-security";
          "$${distro_id}ESM:$${distro_codename}-infra-security";
      };
      Unattended-Upgrade::Automatic-Reboot "true";
      Unattended-Upgrade::Automatic-Reboot-Time "02:30";
      Unattended-Upgrade::Remove-Unused-Kernel-Packages "true";
      Unattended-Upgrade::Remove-Unused-Dependencies "true";
%{ endif ~}
%{ if enable_fail2ban ~}
  # ── fail2ban: sshd jail ───────────────────────────────────────────────
  # Even though SSH is firewalled to ssh_allowed_cidrs (or fully closed at
  # the firewall), fail2ban remains a defence-in-depth layer for the case
  # where the firewall rule is widened by an operator post-bootstrap.
  - path: /etc/fail2ban/jail.d/catalyst-sshd.local
    permissions: '0644'
    content: |
      [sshd]
      enabled = true
      port = ssh
      filter = sshd
      maxretry = 5
      findtime = 10m
      bantime = 1h
      backend = systemd
%{ endif ~}
  # ── flux-system/ghcr-pull Secret ─────────────────────────────────────
  #
  # Every HelmRepository CR in clusters/${sovereign_fqdn}/bootstrap-kit/
  # references `secretRef: name: ghcr-pull` because the bp-* OCI artifacts
  # at `ghcr.io/openova-io/` are PRIVATE. Without this Secret, the
  # source-controller logs:
  #
  #   failed to get authentication secret 'flux-system/ghcr-pull':
  #   secrets "ghcr-pull" not found
  #
  # …and Phase 1 stalls at bp-cilium. The operator workaround (kubectl
  # apply the Secret by hand after Flux installs) is not durable across
  # re-provisioning of the same Sovereign — every fresh control-plane
  # boots without the Secret.
  #
  # We write the Secret into flux-system at cloud-init time, BEFORE
  # /var/lib/catalyst/flux-bootstrap.yaml is applied, so the GitRepository +
  # Kustomization land into a cluster that already has working GHCR creds.
  # The apply step is in runcmd: below; the manifest itself lives here.
  #
  # Token rotation policy: yearly, stored in 1Password under
  # "Catalyst — GHCR pull token (catalyst-ghcr-pull-token)". See
  # docs/SECRET-ROTATION.md. The token NEVER lives in git.
  - path: /var/lib/catalyst/ghcr-pull-secret.yaml
    permissions: '0600'
    content: |
      apiVersion: v1
      kind: Secret
      metadata:
        name: ghcr-pull
        namespace: flux-system
      type: kubernetes.io/dockerconfigjson
      data:
        .dockerconfigjson: ${base64encode(jsonencode({
          auths = {
            "ghcr.io" = {
              username = ghcr_pull_username
              password = ghcr_pull_token
              auth = ghcr_pull_auth_b64
            }
          }
        }))}
  # Flux GitRepository + Kustomization that take over after k3s is up.
  # The clusters/${sovereign_fqdn}/ directory in the public OpenOva monorepo
  # contains a Kustomization tree that installs the 11-component bootstrap
  # kit + bp-catalyst-platform umbrella in dependency order.
  - path: /var/lib/catalyst/flux-bootstrap.yaml
    permissions: '0644'
    content: |
      apiVersion: source.toolkit.fluxcd.io/v1
      kind: GitRepository
      metadata:
        name: openova
        namespace: flux-system
      spec:
        interval: 1m
        url: ${gitops_repo_url}
        ref:
          branch: ${gitops_branch}
        ignore: |
          /*
          !/clusters/${sovereign_fqdn}
          !/platform
          !/products
      ---
      # Two Flux Kustomizations with dependsOn so Crossplane CRDs land
      # before any resource that uses them is dry-run-applied.
      #
      # bootstrap-kit installs the 11 HelmReleases (Cilium, cert-manager,
      # Flux, Crossplane core, sealed-secrets, SPIRE, NATS-JetStream,
      # OpenBao, Keycloak, Gitea, bp-catalyst-platform). bp-crossplane
      # registers the Crossplane core CRDs (Provider, ProviderConfig…)
      # AND the bp-catalyst-platform umbrella reconciles the rest.
      #
      # infrastructure-config applies the cluster's Provider package +
      # ProviderConfig + Compositions. Because it dependsOn bootstrap-kit
      # AND uses wait: true, Flux waits until bootstrap-kit's HelmReleases
      # are Ready (Crossplane core + provider-hcloud installed,
      # hcloud.crossplane.io/v1beta1 CRDs registered) before dry-running
      # ProviderConfig — which is the exact ordering the prior single-
      # Kustomization model tripped over with:
      #   no matches for kind "ProviderConfig" in version
      #   "hcloud.crossplane.io/v1beta1"
      apiVersion: kustomize.toolkit.fluxcd.io/v1
      kind: Kustomization
      metadata:
        name: bootstrap-kit
        namespace: flux-system
      spec:
        interval: 5m
        path: ./clusters/${sovereign_fqdn}/bootstrap-kit
        prune: true
        sourceRef:
          kind: GitRepository
          name: openova
        wait: true
        timeout: 30m
      ---
      apiVersion: kustomize.toolkit.fluxcd.io/v1
      kind: Kustomization
      metadata:
        name: infrastructure-config
        namespace: flux-system
      spec:
        interval: 5m
        path: ./clusters/${sovereign_fqdn}/infrastructure
        prune: true
        sourceRef:
          kind: GitRepository
          name: openova
        dependsOn:
          - name: bootstrap-kit
        wait: true
        timeout: 30m
runcmd:
- swapoff -a
- sed -i '/swap/d' /etc/fstab
- update-alternatives --set iptables /usr/sbin/iptables-legacy || true
- update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy || true
# Activate hardened sshd config (cloud-init may have written authorized_keys
# already from Hetzner ssh_keys[]; we never touch that file).
- systemctl reload ssh || systemctl reload sshd || true
%{ if enable_fail2ban ~}
- systemctl enable --now fail2ban
%{ endif ~}
%{ if enable_unattended_upgrades ~}
- systemctl enable --now unattended-upgrades
%{ endif ~}
# k3s control-plane. Flags per docs/SOVEREIGN-PROVISIONING.md §3 and
# docs/PLATFORM-TECH-STACK.md §8.1:
#   --cluster-init               Initialise embedded etcd (HA-ready).
#   --flannel-backend=none       Cilium replaces flannel.
#   --disable=traefik            Cilium Gateway replaces traefik.
#   --disable=servicelb          Hetzner LB handles ingress.
#   --disable=local-storage      Crossplane-provisioned hcloud-csi instead.
#   --disable-network-policy     Cilium handles NetworkPolicy.
#   --tls-san=${sovereign_fqdn}  API server cert valid for the sovereign FQDN.
- 'curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=${k3s_version} K3S_TOKEN=${k3s_token} INSTALL_K3S_EXEC="server --cluster-init --flannel-backend=none --disable-network-policy --disable=traefik --disable=servicelb --disable=local-storage --tls-san=${sovereign_fqdn} --node-label catalyst.openova.io/role=control-plane --write-kubeconfig-mode=0644" sh -'
# Wait for the API server to be reachable. Cilium hasn't been installed yet,
# so nodes won't report Ready; wait specifically on the API /healthz endpoint.
- 'until kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml get --raw /healthz; do sleep 5; done'
%{ if deployment_id != "" && kubeconfig_bearer_token != "" && catalyst_api_url != "" ~}
# ── Cloud-init kubeconfig postback (issue #183, Option D) ───────────────
#
# The k3s install above wrote /etc/rancher/k3s/k3s.yaml with the API
# server URL pinned to https://127.0.0.1:6443 — kubectl's default for a
# local single-node install. catalyst-api lives off-cluster (Catalyst-Zero
# franchise console on contabo-mkt) and cannot reach 127.0.0.1 on this
# node, so we MUST rewrite that field before sending the kubeconfig
# back. The Hetzner load balancer at $${load_balancer_ipv4} forwards
# 6443 to the control plane's 6443 (firewall rule above), so a kubeconfig
# pointing at the LB's public IPv4 is reachable from anywhere.
#
# Plaintext: we read from /etc/rancher/k3s/k3s.yaml (mode 0644 written
# by k3s), apply the rewrite via sed, write the result to
# /etc/rancher/k3s/k3s.yaml.public (mode 0600 explicitly), then
# curl --data-binary the file content to catalyst-api with the bearer
# token. The .public file is removed at the end of the runcmd block
# so the bearer-protected kubeconfig only lives on this node for the
# few seconds it takes to PUT.
#
# --retry 60 --retry-delay 10 --retry-all-errors handles the case
# where catalyst-api is briefly unreachable (image roll, ingress
# reconciliation) — the cloud-init runcmd budget is bounded by the
# systemd cloud-final timeout (~30 minutes).
- install -m 0600 /dev/null /etc/rancher/k3s/k3s.yaml.public
- sed 's|https://127.0.0.1:6443|https://${load_balancer_ipv4}:6443|g' /etc/rancher/k3s/k3s.yaml > /etc/rancher/k3s/k3s.yaml.public
- chmod 0600 /etc/rancher/k3s/k3s.yaml.public
- |
  curl -fsSL --retry 60 --retry-delay 10 --retry-all-errors \
    -X PUT \
    -H "Authorization: Bearer ${kubeconfig_bearer_token}" \
    -H "Content-Type: application/x-yaml" \
    --data-binary @/etc/rancher/k3s/k3s.yaml.public \
    ${catalyst_api_url}/api/v1/deployments/${deployment_id}/kubeconfig
- rm -f /etc/rancher/k3s/k3s.yaml.public
%{ endif ~}
# ── Cilium FIRST (before Flux) ───────────────────────────────────────────
#
# k3s started with --flannel-backend=none, so the cluster has NO CNI yet.
# If we apply Flux install.yaml at this point, the Flux controller pods
# stay Pending forever — kubelet rejects them with
# "container runtime network not ready: cni plugin not initialized"
# Flux is then unable to reconcile bp-cilium, so Cilium is never
# installed → bootstrap deadlock that we hit in production at
# omantel.omani.works deployment 5cd1bceaaacb71f6 (25 min stuck Pending).
#
# Bootstrap chicken-and-egg: Cilium IS the install unit (bp-cilium), but
# Flux needs a CNI to run, and Cilium IS the CNI. Resolution: install
# Cilium ONCE here via Helm with the same chart + values bp-cilium would
# apply later. When Flux reconciles bp-cilium, it adopts the existing
# release (Helm release-name match), so there is no churn.
#
# Per INVIOLABLE-PRINCIPLES.md #3 the GitOps engine is Flux — this Helm
# install is the one-shot bootstrap exception explicitly authorised by
# the same principle's "everything ELSE" qualifier. The chart version
# matches platform/cilium/blueprint.yaml's chartVersion to keep the
# bootstrap install and the reconciled HelmRelease byte-identical.
- 'curl -sSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash'
- 'helm repo add cilium https://helm.cilium.io/'
- 'helm repo update'
- |
  KUBECONFIG=/etc/rancher/k3s/k3s.yaml helm install cilium cilium/cilium \
    --version 1.16.5 \
    --namespace kube-system \
    --set kubeProxyReplacement=true \
    --set k8sServiceHost=127.0.0.1 \
    --set k8sServicePort=6443 \
    --set ipam.mode=kubernetes \
    --set tunnelProtocol=vxlan \
    --set bpf.masquerade=true
- 'kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml -n kube-system rollout status ds/cilium --timeout=240s'
# Install Flux core. Cilium is now the cluster's CNI, so Flux pods will
# actually start. Flux then reconciles clusters/${sovereign_fqdn}/ which
# adopts the Helm release above as bp-cilium and continues with
# bp-cert-manager, bp-flux (host-level Flux, distinct from this Flux
# which is the CONTROL-PLANE Flux), bp-crossplane, etc.
- 'curl -fsSL https://github.com/fluxcd/flux2/releases/download/v2.4.0/install.yaml | kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml apply -f -'
- 'kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml -n flux-system wait --for=condition=Available --timeout=300s deployment --all'
# ── flux-system/ghcr-pull Secret (applied BEFORE GitRepository) ──────
#
# Apply the docker-registry pull secret rendered above. This MUST land
# before the GitRepository + Kustomization in flux-bootstrap.yaml,
# because the bootstrap-kit Kustomization includes HelmRepository CRs
# that reference this Secret by name; the source-controller resolves
# them on its first reconciliation tick and a missing Secret propagates
# as a Ready=False/AuthError state that has been observed to persist
# for 5+ minutes even after the Secret is later applied.
#
# Idempotent: `kubectl apply` against an existing Secret is a no-op
# when the manifest's bytes match. A reprovision (same Sovereign FQDN)
# rewrites this with the same content; a token rotation propagates
# through here on the next cloud-init render.
- 'kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml apply -f /var/lib/catalyst/ghcr-pull-secret.yaml'
# Apply the Flux bootstrap GitRepository + Kustomization. From here, Flux
# owns the cluster: pulls clusters/${sovereign_fqdn}/, installs Cilium
# via bp-cilium, cert-manager via bp-cert-manager, etc., then bp-catalyst-platform.
- 'kubectl --kubeconfig=/etc/rancher/k3s/k3s.yaml apply -f /var/lib/catalyst/flux-bootstrap.yaml'
# Marker for the catalyst-api provisioner to detect cloud-init is done.
- mkdir -p /var/lib/catalyst
- touch /var/lib/catalyst/cloud-init-complete
final_message: "Catalyst control-plane bootstrap complete after $UPTIME seconds — Flux is now reconciling clusters/${sovereign_fqdn}/"