#!/usr/bin/env bash
# operator-recover-sovereign.sh — idempotent recovery of a partially-provisioned Sovereign.
#
# When a Sovereign provisioning run fails partway (Phase 0 OpenTofu, Phase 1
# bootstrap-kit, or anywhere in between), this script returns the system to a
# clean slate so the operator can re-run Launch in the wizard with the same
# FQDN. It does THREE things, in order:
#
#   1. Purge every Hetzner Cloud resource tagged for that Sovereign.
#   2. Release the PDM allocation for the Sovereign's pool subdomain (if any).
#   3. Mark the catalyst-api deployment record as `cancelled` so the wizard
#      stops streaming events for it and the operator can re-create cleanly.
#
# Resource names in the OpenTofu module are deterministic:
#   catalyst-${replace(sovereign_fqdn, ".", "-")}-{role}
# so re-running Launch with the same FQDN after this script is fully idempotent.
#
# DRY-RUN by default. Pass --apply to actually delete.
#
# Anchored to the canonical purge logic in:
#   /home/openova/.claude/projects/-home-openova-repos-openova-private/memory/feedback_idempotent_iac_purge.md
# and the runbook at:
#   docs/RUNBOOK-OPERATIONS.md §"Recovery procedure"
#
# Usage:
#   ./scripts/operator-recover-sovereign.sh <sovereign-fqdn>            # dry-run
#   ./scripts/operator-recover-sovereign.sh <sovereign-fqdn> --apply    # destructive
#
# Exit codes used in this part of the script:
#   0  --help requested
#   2  usage error (missing/invalid FQDN, unknown flag)
#
# Required tools:    bash, curl, python3, kubectl
# Required env vars: HETZNER_API_TOKEN (read+write project token; mandatory
#                    with --apply, optional for a dry-run preview)
# Optional env vars: PDM_BASE_URL (default:
#                      http://pool-domain-manager.openova-system.svc.cluster.local:8080)
#                    POOL_DOMAIN (default: derived from FQDN's parent zone)
#                    CATALYST_NAMESPACE (default: catalyst)

set -euo pipefail

# ── Argument parsing ──────────────────────────────────────────────────
# Prints the command-line synopsis on stdout; callers redirect as needed.
usage() {
  echo "Usage: $0 <sovereign-fqdn> [--apply|--dry-run]"
  echo "Example: $0 omantel.omani.works --apply"
}

# The FQDN is the first positional argument; everything after it is a flag.
FQDN="${1:-}"
MODE="dry-run"
shift || true # no-op when the script was called without arguments

case "$FQDN" in
  -h | --help)
    usage
    exit 0
    ;;
esac

while [ "$#" -gt 0 ]; do
  case "$1" in
    --apply) MODE="apply" ;;
    --dry-run) MODE="dry-run" ;;
    *)
      echo "ERR: unknown flag: $1" >&2
      exit 2
      ;;
  esac
  shift
done

if [ -z "$FQDN" ]; then
  usage >&2
  exit 2
fi

# Validate the FQDN before it is used anywhere. It ends up inside Hetzner
# label selectors, URLs and commands executed in Pods, so only plain hostname
# characters are accepted. This also catches a flag passed in the FQDN
# position (e.g. `script --apply`) and single-label names, for which no
# parent zone could be derived.
case "$FQDN" in
  -* | .* | *. | *..* | *[!A-Za-z0-9.-]*)
    echo "ERR: '${FQDN}' is not a valid Sovereign FQDN." >&2
    usage >&2
    exit 2
    ;;
  *.*) ;; # at least two labels — OK
  *)
    echo "ERR: '${FQDN}' has no parent zone; expected e.g. omantel.omani.works." >&2
    usage >&2
    exit 2
    ;;
esac

# Slug used by the OpenTofu module to name resources, matching:
#   catalyst-${replace(sovereign_fqdn, ".", "-")}-{role}
SLUG=${FQDN//./-}
LABEL_KEY="catalyst.openova.io/sovereign"
NS="${CATALYST_NAMESPACE:-catalyst}"

# Pool domain inference: the parent zone of the FQDN.
#   omantel.omani.works -> omani.works
#   acme.openova.io     -> openova.io
#   acme.bank.com       -> bank.com (would only matter if PDM manages it)
POOL_DOMAIN="${POOL_DOMAIN:-${FQDN#*.}}"

# ── Output helpers ────────────────────────────────────────────────────
# Each helper prints its arguments as one line wrapped in an ANSI style.
bold() { printf "\033[1m%s\033[0m\n" "$*"; }
red() { printf "\033[31m%s\033[0m\n" "$*"; }
green() { printf "\033[32m%s\033[0m\n" "$*"; }
yellow() { printf "\033[33m%s\033[0m\n" "$*"; }
cyan() { printf "\033[36m%s\033[0m\n" "$*"; }

# Prints an action line tagged with the current mode so the operator can tell
# a preview from a real change.
prefix() {
  if [ "$MODE" = "dry-run" ]; then
    yellow "[DRY-RUN] $*"
  else
    cyan "[APPLY] $*"
  fi
}

# ── Banner ────────────────────────────────────────────────────────────
bold "==================================================================="
bold " OpenOva Catalyst — Operator Sovereign Recovery"
bold "==================================================================="
echo " Sovereign FQDN: $FQDN"
echo " Resource slug: catalyst-${SLUG}-*"
echo " Hetzner label: ${LABEL_KEY}=${FQDN}"
echo " Pool parent zone: ${POOL_DOMAIN}"
echo " Catalyst NS: ${NS}"
echo " Mode: $MODE"
bold "==================================================================="
echo
if [ "$MODE" = "dry-run" ]; then
  yellow " Running in DRY-RUN mode. Nothing will be deleted."
  yellow " Re-run with --apply to actually purge resources."
else
  red " Running in APPLY mode. Resources WILL be deleted. CTRL-C now to abort."
  sleep 3 # grace period for the operator to abort
fi
echo

# ── Pre-flight ────────────────────────────────────────────────────────
# In dry-run mode we tolerate a missing/invalid token — the operator is just
# previewing what would happen. In apply mode we hard-fail.
HAVE_HETZNER_TOKEN=0
if [ -n "${HETZNER_API_TOKEN:-}" ]; then
  # `|| true` keeps a transport failure from tripping `set -e`; HTTP_CODE is
  # then simply not "200" and the "rejected" branch below handles it.
  HTTP_CODE=$(curl -sS -o /dev/null -w '%{http_code}' \
    -H "Authorization: Bearer ${HETZNER_API_TOKEN}" \
    "https://api.hetzner.cloud/v1/servers?per_page=1" || true)
  if [ "$HTTP_CODE" = "200" ]; then
    HAVE_HETZNER_TOKEN=1
    green " HETZNER_API_TOKEN validated — Hetzner inventory will be queried live."
  elif [ "$MODE" = "apply" ]; then
    red "ERR: HETZNER_API_TOKEN rejected by Hetzner API (HTTP $HTTP_CODE). Aborting."
    exit 3
  else
    yellow " HETZNER_API_TOKEN rejected by Hetzner (HTTP $HTTP_CODE) — Step 1 will be a name-only preview."
  fi
elif [ "$MODE" = "apply" ]; then
  red "ERR: HETZNER_API_TOKEN is not set. Export the read+write token for the Sovereign's project."
  exit 3
else
  yellow " HETZNER_API_TOKEN is not set — Step 1 will be a name-only preview (set the token to enumerate resources for real)."
fi
echo

# ── Step 1 — Hetzner purge ────────────────────────────────────────────
bold "── Step 1 / 3 — Hetzner Cloud resource purge ─────────────────────"

H="Authorization: Bearer ${HETZNER_API_TOKEN:-NO_TOKEN}"
# `name=content` form understood by `curl --data-urlencode`: curl splits at the
# first '=' and URL-encodes the rest (the label key contains '/' and '=').
SEL="label_selector=${LABEL_KEY}=${FQDN}"
KINDS_LABELED="servers load_balancers networks firewalls volumes primary_ips floating_ips"
KINDS_SWEEP="firewalls networks load_balancers ssh_keys servers volumes primary_ips floating_ips"
# Deterministic name prefix of every resource the OpenTofu module creates.
NAME_PREFIX="catalyst-${SLUG}"

#######################################
# Lists resources of one kind across ALL result pages.
# Globals:   H, SEL (read)
# Arguments: $1 - Hetzner collection name (servers, firewalls, ...)
#            $2 - optional name prefix. When empty, the server-side label
#                 selector $SEL is applied. When set, the label selector is
#                 NOT sent and items are filtered client-side: the name must
#                 equal the prefix or start with "<prefix>-".
# Outputs:   one "<id> <name>" line per match on stdout
# Returns:   0 on success; 1 on transport, JSON, or API-level error
#######################################
hetzner_list() {
  local kind="$1" name_prefix="${2:-}"
  local page=1 body
  local -a query
  while [ -n "$page" ]; do
    # Without explicit paging only the first (small) page would be inspected
    # and resources beyond it would silently survive the purge.
    query=(--data-urlencode "per_page=50" --data-urlencode "page=${page}")
    if [ -z "$name_prefix" ]; then
      query+=(--data-urlencode "$SEL")
    fi
    body=$(curl -sS -G -H "$H" "${query[@]}" \
      "https://api.hetzner.cloud/v1/${kind}") || return 1
    # Values reach Python through argv, never by splicing them into the source.
    printf '%s' "$body" | python3 -c '
import json, sys
kind, name_prefix = sys.argv[1], sys.argv[2]
doc = json.load(sys.stdin)
if "error" in doc:
    sys.stderr.write("Hetzner API error listing %s: %s\n" % (kind, doc["error"]))
    sys.exit(1)
for item in doc.get(kind, []):
    name = item.get("name", "")
    if not name_prefix or name == name_prefix or name.startswith(name_prefix + "-"):
        print(item["id"], name)
' "$kind" "$name_prefix" || return 1
    # An empty string ends the loop once there is no further page.
    page=$(printf '%s' "$body" | python3 -c '
import json, sys
meta = json.load(sys.stdin).get("meta") or {}
print((meta.get("pagination") or {}).get("next_page") or "")
') || return 1
  done
}

# Lists "<id> <name>" for resources of kind $1 carrying this Sovereign's label.
list_ids_labelled() {
  hetzner_list "$1"
}

# Lists "<id> <name>" for resources of kind $1 whose name follows the
# deterministic catalyst-<slug>[-role] scheme. Used for ssh_keys, which this
# script matches by name rather than by label, and for the verification sweep.
# The match is anchored to the full prefix: a bare substring test on the slug
# would also hit other Sovereigns (slug "tel-omani-works" is contained in
# "catalyst-omantel-omani-works-cp").
list_ids_by_name() {
  hetzner_list "$1" "$NAME_PREFIX"
}

#######################################
# Deletes one Hetzner resource (only announces it in dry-run mode).
# Globals:   MODE, H (read)
# Arguments: $1 - kind, $2 - numeric id, $3 - name (for display only)
# Outputs:   an action line; a warning when the API does not confirm
# Returns:   always 0 — a failed delete must not abort the remaining purge
#######################################
delete_resource() {
  local kind="$1" id="$2" name="$3"
  local code
  prefix "DELETE ${kind}/${id} (${name})"
  if [ "$MODE" = "apply" ]; then
    code=$(curl -sS -o /dev/null -w '%{http_code}' -X DELETE -H "$H" \
      "https://api.hetzner.cloud/v1/${kind}/${id}") || code="000"
    case "$code" in
      2?? | 404) ;; # deleted, or already gone
      *) red " WARN: delete returned HTTP ${code} for ${kind}/${id}" ;;
    esac
  fi
}

# Records that a listing failed so the summary does not claim "clean".
note_list_failure() {
  LIST_ERRORS=1
  red " WARN: could not list ${1} from the Hetzner API — the purge may be incomplete."
}

#######################################
# Deletes every resource in a listing produced by the list_ids_* functions.
# Globals:   ANY_FOUND (written)
# Arguments: $1 - kind
#            $2 - listing ("<id> <name>" lines)
#            $3 - "1" to announce each item as lingering (verification sweep)
#######################################
purge_listed() {
  local kind="$1" listing="$2" lingering="${3:-0}"
  local line id name
  while IFS= read -r line; do
    [ -n "$line" ] || continue
    ANY_FOUND=1
    id=${line%% *}
    name=${line#* }
    if [ "$lingering" = "1" ]; then
      yellow " Lingering ${kind}/${id} (${name}) — re-deleting"
    fi
    delete_resource "$kind" "$id" "$name"
  done <<<"$listing"
}

ANY_FOUND=0
LIST_ERRORS=0
if [ "$HAVE_HETZNER_TOKEN" = "1" ]; then
  # Pass 1: label-selected resources, in dependency order (servers first so
  # LB/network/firewall freeing isn't blocked).
  # shellcheck disable=SC2086 — the kind lists are space-separated on purpose
  for kind in $KINDS_LABELED; do
    LISTING=$(list_ids_labelled "$kind") || note_list_failure "$kind"
    purge_listed "$kind" "$LISTING"
  done

  # Pass 2: ssh_keys (matched by name prefix, not by label).
  LISTING=$(list_ids_by_name "ssh_keys") || note_list_failure "ssh_keys"
  purge_listed "ssh_keys" "$LISTING"

  # Pass 3: verification sweep — Hetzner DELETE often returns 204 even when
  # the resource persists (notably firewalls right after a server delete).
  # Re-query without the label filter and re-delete by-id any that linger.
  # Skipping this caused "name is already used (uniqueness_error)" on the
  # next provision attempt. See feedback_idempotent_iac_purge.md.
  echo
  yellow "── Verification sweep (catches Hetzner DELETE-returns-204-but-keeps-resource quirk) ──"
  # shellcheck disable=SC2086
  for kind in $KINDS_SWEEP; do
    LISTING=$(list_ids_by_name "$kind") || note_list_failure "$kind"
    purge_listed "$kind" "$LISTING" 1
  done

  if [ "$ANY_FOUND" = "0" ]; then
    if [ "$LIST_ERRORS" = "0" ]; then
      green " No Hetzner resources found for ${FQDN}. Already clean."
    else
      red " WARN: nothing was listed, but some listings failed — cannot confirm ${FQDN} is clean."
    fi
  fi
else
  yellow " (Token not available — listing the resource names that WOULD be inspected.)"
  # shellcheck disable=SC2086
  for kind in $KINDS_LABELED ssh_keys; do
    prefix "QUERY ${kind}?label_selector=${LABEL_KEY}=${FQDN} (would DELETE any matches: catalyst-${SLUG}-*)"
  done
  echo
  yellow "── Verification sweep (would also re-list without label filter and re-delete catalyst-${SLUG}-* names) ──"
  # shellcheck disable=SC2086
  for kind in $KINDS_SWEEP; do
    prefix "QUERY ${kind} (would re-DELETE any name matching slug ${SLUG})"
  done
fi
echo

# ── Step 2 — PDM allocation release ───────────────────────────────────
bold "── Step 2 / 3 — Pool-domain-manager allocation release ───────────"

# Sub-label = first label of the FQDN (omantel.omani.works -> omantel).
SUB=${FQDN%%.*}

# Locate PDM. Prefer the in-cluster Deployment (reached via `kubectl exec`);
# fall back to PDM_BASE_URL only when the operator set it explicitly.
PDM_BASE_URL_OVERRIDE="${PDM_BASE_URL:-}"
PDM_BASE_URL="${PDM_BASE_URL:-http://pool-domain-manager.openova-system.svc.cluster.local:8080}"
PDM_RELEASE_PATH="/api/v1/pool/${POOL_DOMAIN}/release?sub=${SUB}"
PDM_CHECK_PATH="/api/v1/pool/${POOL_DOMAIN}/check?sub=${SUB}"

# Reads a PDM /check response on stdin and prints its "available" field.
# JSON booleans are printed as JSON (true/false) — Python's own True/False
# would never match the case arms below. Prints "unknown" on any parse error.
pdm_available() {
  python3 -c '
import json, sys
try:
    value = json.load(sys.stdin).get("available", "unknown")
except Exception:
    value = "unknown"
print(json.dumps(value) if isinstance(value, bool) else value)
' 2>/dev/null || echo unknown
}

prefix "DELETE ${PDM_BASE_URL}${PDM_RELEASE_PATH}"
if [ "$MODE" = "apply" ]; then
  if kubectl -n openova-system get deploy pool-domain-manager >/dev/null 2>&1; then
    # Run the request from inside the PDM Pod against its own localhost port.
    # A failure is reported but not fatal: on a re-run the allocation may
    # simply have been released already.
    if ! kubectl -n openova-system exec deploy/pool-domain-manager -- \
      sh -c "wget -q -O - --method=DELETE --header='Content-Type: application/json' 'http://localhost:8080${PDM_RELEASE_PATH}'" \
      2>/dev/null; then
      yellow " PDM release call did not succeed (the allocation may already be released)."
    fi
  elif [ -n "$PDM_BASE_URL_OVERRIDE" ]; then
    # Operator-supplied PDM endpoint: call it directly from this host.
    PDM_CODE=$(curl -sS -o /dev/null -w '%{http_code}' -X DELETE \
      -H 'Content-Type: application/json' \
      "${PDM_BASE_URL}${PDM_RELEASE_PATH}") || PDM_CODE="000"
    case "$PDM_CODE" in
      2??) green " PDM release accepted (HTTP ${PDM_CODE})." ;;
      *) yellow " PDM release returned HTTP ${PDM_CODE} (the allocation may already be released)." ;;
    esac
  else
    yellow " pool-domain-manager Deployment not found in openova-system; skipping PDM release."
    yellow " If PDM lives elsewhere, set PDM_BASE_URL and re-run."
  fi
else
  # Dry-run: just check whether the allocation exists.
  PDM_CHECKED=0
  OUT='{}'
  if kubectl -n openova-system get deploy pool-domain-manager >/dev/null 2>&1; then
    PDM_CHECKED=1
    OUT=$(kubectl -n openova-system exec deploy/pool-domain-manager -- \
      sh -c "wget -q -O - 'http://localhost:8080${PDM_CHECK_PATH}'" \
      2>/dev/null || echo '{}')
  elif [ -n "$PDM_BASE_URL_OVERRIDE" ]; then
    PDM_CHECKED=1
    OUT=$(curl -sS "${PDM_BASE_URL}${PDM_CHECK_PATH}" 2>/dev/null || echo '{}')
  else
    yellow " pool-domain-manager Deployment not found in openova-system (skipping check)."
  fi
  if [ "$PDM_CHECKED" = "1" ]; then
    AVAIL=$(printf '%s' "$OUT" | pdm_available)
    case "$AVAIL" in
      true) green " PDM reports ${SUB}.${POOL_DOMAIN} is already AVAILABLE — nothing to release." ;;
      false) yellow " PDM reports ${SUB}.${POOL_DOMAIN} is currently RESERVED or COMMITTED — release will free it." ;;
      *) yellow " PDM check returned: ${OUT}" ;;
    esac
  fi
fi
echo

# ── Step 3 — catalyst-api deployment record ───────────────────────────
bold "── Step 3 / 3 — catalyst-api deployment record cancellation ──────"

RECORDS_DIR="/var/lib/catalyst/deployments"

# Reads one deployment record (JSON) on stdin. When its request.sovereignFQDN
# equals $1, prints the record's status and returns 0; otherwise (including
# unparsable input) prints nothing and returns non-zero.
record_status_if_match() {
  python3 -c '
import json, sys
try:
    d = json.load(sys.stdin)
    if d.get("request", {}).get("sovereignFQDN", "") != sys.argv[1]:
        sys.exit(1)
    print(d.get("status", ""))
except Exception:
    sys.exit(1)
' "$1" 2>/dev/null
}

# Reads a deployment record on stdin and prints it with status=cancelled,
# finishedAt=$1 and an operator-recovery event appended.
cancel_record_json() {
  python3 -c '
import json, sys
now = sys.argv[1]
d = json.load(sys.stdin)
d["status"] = "cancelled"
d["finishedAt"] = now
d.setdefault("events", []).append({
    "time": now,
    "phase": "operator-recovery",
    "level": "warn",
    "message": "cancelled by scripts/operator-recover-sovereign.sh",
})
print(json.dumps(d, indent=2))
' "$1" 2>/dev/null
}

if ! kubectl -n "$NS" get deploy catalyst-api >/dev/null 2>&1; then
  yellow " catalyst-api Deployment not found in namespace ${NS}. Skipping."
else
  # Find the record(s) for this FQDN. Records live at:
  #   /var/lib/catalyst/deployments/<deployment-id>.json
  # and contain { "request": { "sovereignFQDN": "...", ... }, "status": "...", ... }
  POD=$(kubectl -n "$NS" get pod -l app.kubernetes.io/name=catalyst-api \
    -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
  if [ -z "$POD" ]; then
    yellow " No catalyst-api Pod found. Skipping deployment-record cancellation."
  else
    # Per the original author's note the catalyst-api container ships neither
    # python3 nor jq, so each record is pulled out with `kubectl exec ... cat`
    # and parsed here on the operator's host.
    DEPLOY_FILES=$(kubectl -n "$NS" exec "$POD" -- sh -c \
      "ls ${RECORDS_DIR}/*.json 2>/dev/null || true" 2>/dev/null || true)
    MATCHED_FILES=()
    MATCHED_STATUS=()
    while IFS= read -r f; do
      [ -n "$f" ] || continue
      # </dev/null: keep kubectl away from the loop's own stdin.
      JSON=$(kubectl -n "$NS" exec "$POD" -- cat "$f" 2>/dev/null </dev/null || true)
      if RECORD_STATUS=$(printf '%s' "$JSON" | record_status_if_match "$FQDN"); then
        MATCHED_FILES+=("$f")
        MATCHED_STATUS+=("$RECORD_STATUS")
      fi
    done <<<"$DEPLOY_FILES"

    if [ "${#MATCHED_FILES[@]}" -eq 0 ]; then
      green " No catalyst-api deployment records reference ${FQDN}. Nothing to cancel."
    else
      for i in "${!MATCHED_FILES[@]}"; do
        RECORD_FILE=${MATCHED_FILES[$i]}
        DID=$(basename "$RECORD_FILE" .json)
        DSTATUS=${MATCHED_STATUS[$i]}
        # Idempotence: a second run must not append another cancellation
        # event or move finishedAt on a record that is already cancelled.
        if [ "$DSTATUS" = "cancelled" ]; then
          green " Deployment ${DID} is already cancelled — leaving it untouched."
          continue
        fi
        prefix "Mark deployment ${DID} (current status: ${DSTATUS}) -> status=cancelled"
        if [ "$MODE" = "apply" ]; then
          # Pull, mutate locally on the host (where python3 exists), push back.
          NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ)
          ORIG=$(kubectl -n "$NS" exec "$POD" -- cat "$RECORD_FILE" 2>/dev/null </dev/null || true)
          NEW=$(printf '%s' "$ORIG" | cancel_record_json "$NOW" || true)
          if [ -n "$NEW" ]; then
            # Stream the new JSON into the Pod via stdin -> cat. The target
            # path is passed as a positional argument so it is never
            # re-parsed by the remote shell.
            printf '%s\n' "$NEW" | kubectl -n "$NS" exec -i "$POD" -- \
              sh -c 'cat > "$1"' sh "$RECORD_FILE" ||
              red " WARN: could not rewrite ${DID}.json"
          else
            red " WARN: could not parse ${DID}.json — skipping rewrite"
          fi
        fi
      done
    fi
  fi
fi
echo

# ── Done ──────────────────────────────────────────────────────────────
bold "==================================================================="
if [ "$MODE" = "dry-run" ]; then
  yellow " DRY-RUN complete. No changes made."
  yellow " Re-run with --apply to actually purge."
else
  green " RECOVERY APPLIED. The operator may now re-run Launch in the wizard"
  green " with sovereign-fqdn=${FQDN}. Re-runs are fully idempotent because"
  green " every Hetzner resource is named deterministically off the FQDN."
fi
bold "==================================================================="
exit 0