fix(bp-openbao): unseal on idempotent path + persist keys (Closes #539) (#540)

PR #528 added unseal logic but only on the FRESH-init branch. When a
previous Job pod completed `bao operator init` but exited before the
unseal block (or when openbao-0 simply restarts under shamir seal),
the next reconcile takes the "already initialized" branch and exits
without ever running `bao operator unseal`. Symptom on otech21:
init-job logs end with `auto-unseal init complete`, but
`bao status` reports Initialized=true Sealed=true forever, the
bp-openbao HR stays Unknown/Running for the full 15m install
timeout, and bp-external-secrets/bp-external-secrets-stores block
on the dep.

Fix has two parts:

1. Persist `unseal_keys_b64` on fresh init to a new K8s Secret
   `openbao-unseal-keys` (BEFORE applying the keys, so that an unseal
   crash mid-step is recoverable on the next retry).
2. Add a Step 2a "idempotent-path unseal" branch: when bao reports
   Initialized=true Sealed=true, fetch the persisted keys Secret
   and apply unseal exactly the same way Step 3a does on fresh
   init. Verify Sealed=false and exit; otherwise FATAL with the
   manual-recovery pointer.

RBAC: extend the openbao-auto-unseal Role to allow create/get/
patch/update on openbao-unseal-keys (alongside openbao-init-marker).

Chart bump 1.2.3 → 1.2.4. HR ref in
clusters/_template/bootstrap-kit/08-openbao.yaml updated to match
so cloud-init-templated Sovereigns pick up the new chart.

Co-authored-by: e3mrah <emrah.baysal@openova.io>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
e3mrah 2026-05-02 10:44:46 +04:00 committed by GitHub
parent 560d18a4d9
commit 8cde771c0f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 122 additions and 13 deletions

View File

@ -53,7 +53,7 @@ spec:
chart:
spec:
chart: bp-openbao
version: 1.2.3
version: 1.2.4
sourceRef:
kind: HelmRepository
name: bp-openbao

View File

@ -1,6 +1,6 @@
apiVersion: v2
name: bp-openbao
version: 1.2.3
version: 1.2.4
description: |
Catalyst-curated Blueprint umbrella chart for OpenBao. Depends on the
upstream `openbao` chart as a Helm subchart so `helm dependency build`

View File

@ -69,6 +69,11 @@ rules:
verbs: ["create", "get", "patch", "update"]
resourceNames:
- openbao-init-marker
# openbao-unseal-keys: persisted unseal-key set used by the init
# Job's idempotent-resume path (issue #539). Required so the Job
# can both write the keys on fresh init and re-read them on a
# later retry when the vault is initialised but still sealed.
- openbao-unseal-keys
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding

View File

@ -117,12 +117,12 @@ spec:
sleep 5
done
# ─── Step 2: idempotency check — skip if already initialised ─
# ─── Step 2: idempotency check — skip init if already initialised ─
# `bao status` exit code semantics:
# 0 — initialized AND unsealed
# 1 — error (not reachable)
# 2 — initialized but sealed
# We treat 0 OR 2 as "already initialized, exit success".
# We treat 0 OR 2 as "already initialized".
STATUS_RC=0
bao status >/dev/null 2>&1 || STATUS_RC=$?
if [ "$STATUS_RC" = "0" ] || [ "$STATUS_RC" = "2" ]; then
@ -133,16 +133,90 @@ spec:
SKIP_INIT=1
fi
fi
# Shared in-cluster Kubernetes API settings. They are assigned a
# single time here and reused by the later steps: the Step 2a
# resume-unseal, the Step 3 seed read and unseal-key persistence,
# the Step 4 marker write and the Step 5 seed delete.
# NOTE(review): every wget call visible in this hunk passes
# --no-check-certificate and none of them references CACERT —
# confirm whether CACERT is still needed elsewhere in the script.
UNSEAL_SECRET_NAME='openbao-unseal-keys'
APISERVER='https://kubernetes.default.svc'
CACERT='/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'
TOKEN="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
# ─── Step 2a: idempotent-path unseal (issue #539) ──────────
# Runs only when Step 2 set SKIP_INIT (vault already initialised).
# If `bao status` reports sealed=true, the unseal keys persisted in
# the $UNSEAL_SECRET_NAME Secret are fetched through the Kubernetes
# API and applied, then the seal state is re-checked:
#   * sealed=false afterwards → log and fall through;
#   * anything else           → FATAL, exit 1.
# Background: PR #528 only unsealed on the FRESH init path, so a Job
# pod that completed `bao operator init` but died before (or during)
# the unseal block left the vault sealed on every later run — the
# symptom captured in #539.
#
# Reads:  SKIP_INIT, TOKEN, APISERVER, NAMESPACE, UNSEAL_SECRET_NAME
# Writes: /tmp/.unseal-keys (temporary key material; removed before
#         every exit this step performs once the file exists)

# Print the value of the "sealed" field from `bao status -format=json`.
# Output is empty when the status output contains no "sealed" line.
_openbao_sealed_state() {
  bao status -format=json 2>/dev/null | grep -E '"sealed"' | head -1 | sed -E 's/.*"sealed"[[:space:]]*:[[:space:]]*(true|false).*/\1/'
}

if [ -n "${SKIP_INIT:-}" ]; then
  SEALED=$(_openbao_sealed_state)
  if [ "$SEALED" = "true" ]; then
    echo "[openbao-init] vault is sealed — fetching persisted unseal keys from $UNSEAL_SECRET_NAME"
    UNSEAL_GET_RC=0
    UNSEAL_RESPONSE=$(wget -qO- --no-check-certificate \
      --header="Authorization: Bearer $TOKEN" \
      "$APISERVER/api/v1/namespaces/$NAMESPACE/secrets/$UNSEAL_SECRET_NAME") || UNSEAL_GET_RC=$?
    if [ "$UNSEAL_GET_RC" -ne 0 ] || [ -z "$UNSEAL_RESPONSE" ]; then
      echo "[openbao-init] FATAL: cannot fetch $UNSEAL_SECRET_NAME — vault is sealed but the unseal-keys Secret is missing."
      echo "[openbao-init] This means a prior init completed but never persisted keys (chart <1.2.4)."
      echo "[openbao-init] Manual recovery: docs/RUNBOOK-PROVISIONING.md §openbao-auto-unseal — wipe data-openbao-0 PVC and let init run fresh."
      exit 1
    fi
    # `sed -n …p` prints only when the substitution matched, so a
    # missing field yields an empty string regardless of whether the
    # API response was single- or multi-line. printf (not echo) keeps
    # backslashes in the JSON from being interpreted.
    KEYS_B64_FIELD=$(printf '%s' "$UNSEAL_RESPONSE" | tr -d '\n' | sed -n -E 's/.*"unseal-keys-b64"[[:space:]]*:[[:space:]]*"([^"]*)".*/\1/p')
    THRESHOLD_FIELD=$(printf '%s' "$UNSEAL_RESPONSE" | tr -d '\n' | sed -n -E 's/.*"unseal-threshold"[[:space:]]*:[[:space:]]*"([^"]*)".*/\1/p')
    if [ -z "$KEYS_B64_FIELD" ]; then
      echo "[openbao-init] FATAL: $UNSEAL_SECRET_NAME has no unseal-keys-b64 field"
      exit 1
    fi
    # K8s Secret data fields are base64-encoded. Outer base64
    # is the K8s wrapper; the decoded payload itself is a
    # newline-separated list of unseal keys (each key is
    # itself base64 — that's the OpenBao wire format).
    # The file is recreated under umask 077 so the key material is
    # only readable by the user running this script.
    rm -f /tmp/.unseal-keys
    ( umask 077; printf '%s' "$KEYS_B64_FIELD" | base64 -d > /tmp/.unseal-keys ) || {
      rm -f /tmp/.unseal-keys
      echo "[openbao-init] FATAL: $UNSEAL_SECRET_NAME unseal-keys-b64 is not valid base64"
      exit 1
    }
    # Threshold: decode if present; anything empty, non-numeric or
    # below 1 falls back to 1 (the seal-state re-check below still
    # catches a threshold that turns out to be too low).
    UNSEAL_THRESHOLD=
    if [ -n "$THRESHOLD_FIELD" ]; then
      UNSEAL_THRESHOLD=$(printf '%s' "$THRESHOLD_FIELD" | base64 -d) || UNSEAL_THRESHOLD=
    fi
    case "$UNSEAL_THRESHOLD" in
      ''|*[!0-9]*) UNSEAL_THRESHOLD=1 ;;
    esac
    if [ "$UNSEAL_THRESHOLD" -lt 1 ]; then
      UNSEAL_THRESHOLD=1
    fi
    KEY_COUNT=$(wc -l < /tmp/.unseal-keys | tr -d ' ')
    if [ "$KEY_COUNT" -lt "$UNSEAL_THRESHOLD" ]; then
      rm -f /tmp/.unseal-keys
      echo "[openbao-init] FATAL: persisted $KEY_COUNT key(s) but threshold=$UNSEAL_THRESHOLD"
      exit 1
    fi
    echo "[openbao-init] applying $UNSEAL_THRESHOLD unseal key(s) from persisted Secret"
    I=0
    while [ "$I" -lt "$UNSEAL_THRESHOLD" ]; do
      I=$((I+1))
      KEY=$(sed -n "${I}p" /tmp/.unseal-keys)
      if [ -z "$KEY" ]; then
        rm -f /tmp/.unseal-keys
        echo "[openbao-init] FATAL: empty key at slot $I"
        exit 1
      fi
      # A failing call is only logged: the authoritative verdict is
      # the seal-state re-check below, and carrying on guarantees the
      # key file is removed before this step exits.
      bao operator unseal "$KEY" >/dev/null \
        || echo "[openbao-init] WARN: bao operator unseal returned non-zero for key slot $I"
    done
    rm -f /tmp/.unseal-keys
    SEALED_AFTER=$(_openbao_sealed_state)
    if [ "$SEALED_AFTER" != "false" ]; then
      echo "[openbao-init] FATAL: vault still sealed after applying $UNSEAL_THRESHOLD key(s) — sealed=$SEALED_AFTER"
      bao status || true
      exit 1
    fi
    echo "[openbao-init] resume-unsealed (sealed=false)"
  else
    echo "[openbao-init] vault already unsealed — nothing to do on idempotent path"
  fi
fi
# ─── Step 3: read seed and run `bao operator init` ──────────
if [ -z "${SKIP_INIT:-}" ]; then
echo "[openbao-init] reading seed Secret $NAMESPACE/$SEED_SECRET_NAME"
# The upstream openbao image bundles `wget` and basic
# POSIX shell tools. We use the in-cluster K8s API via the
# ServiceAccount token mounted by Kubernetes.
TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
CACERT=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
APISERVER="https://kubernetes.default.svc"
# TOKEN/CACERT/APISERVER were defined in Step 2 above
# (used by both Step 2a unseal-resume and Step 3+4+5).
SEED_B64=$(wget -qO- --no-check-certificate \
--header="Authorization: Bearer $TOKEN" \
"$APISERVER/api/v1/namespaces/$NAMESPACE/secrets/$SEED_SECRET_NAME" \
@ -224,6 +298,38 @@ spec:
echo "[openbao-init] FATAL: extracted $KEY_COUNT unseal key(s) but threshold=$UNSEAL_THRESHOLD — see /tmp/init-output.json"
exit 1
fi
# ─── Persist unseal keys for idempotent recovery (issue #539) ─
# Store the unseal keys as a K8s Secret so future Job runs
# can take the idempotent path (Step 2a) and resume an
# already-initialised-but-still-sealed vault. Without this
# persistence, any pod restart of openbao-0 leaves it
# sealed forever (shamir seal type re-seals on restart),
# because the keys produced by `bao operator init` only
# ever existed in /tmp/.unseal-keys of the original Job
# pod. Writing this secret BEFORE unseal means even if
# the unseal block crashes the next retry can recover.
#
# Flow: POST (create) the Secret; if that fails for any reason,
# fall back to PUT (replace). Persistence stays best-effort — the
# script carries on to the unseal loop either way, because aborting
# here would strand an initialised vault whose keys exist only in
# this pod — but a double failure is now logged instead of being
# silently discarded.
#
# Reads: UNSEAL_SECRET_NAME, NAMESPACE, TOKEN, APISERVER,
#        UNSEAL_THRESHOLD, /tmp/.unseal-keys
#
# NOTE(review): --method / --body-data are GNU wget options; confirm
# the wget shipped in this image accepts them, otherwise the PUT
# fallback can never succeed.
# NOTE(review): Kubernetes RBAC cannot restrict the `create` verb by
# resourceNames; confirm the Role grants `create` on secrets through
# a rule without resourceNames, otherwise the POST below is rejected.
echo "[openbao-init] persisting unseal keys to Secret $UNSEAL_SECRET_NAME"
KEYS_PAYLOAD_B64=$(base64 < /tmp/.unseal-keys | tr -d '\n')
THRESHOLD_PAYLOAD_B64=$(printf '%s' "$UNSEAL_THRESHOLD" | base64 | tr -d '\n')
UNSEAL_SECRET_BODY=$(printf '{"apiVersion":"v1","kind":"Secret","metadata":{"name":"%s","namespace":"%s","labels":{"catalyst.openova.io/blueprint":"bp-openbao","catalyst.openova.io/component":"openbao-unseal-keys"}},"type":"Opaque","data":{"unseal-keys-b64":"%s","unseal-threshold":"%s"}}' \
  "$UNSEAL_SECRET_NAME" "$NAMESPACE" "$KEYS_PAYLOAD_B64" "$THRESHOLD_PAYLOAD_B64")
US_CREATE_RC=0
wget -qO- --no-check-certificate \
  --header="Authorization: Bearer $TOKEN" \
  --header="Content-Type: application/json" \
  --post-data="$UNSEAL_SECRET_BODY" \
  "$APISERVER/api/v1/namespaces/$NAMESPACE/secrets" >/dev/null 2>&1 || US_CREATE_RC=$?
if [ "$US_CREATE_RC" -ne 0 ]; then
  echo "[openbao-init] $UNSEAL_SECRET_NAME exists, replacing via PUT"
  US_REPLACE_RC=0
  wget -qO- --no-check-certificate \
    --header="Authorization: Bearer $TOKEN" \
    --header="Content-Type: application/json" \
    --method=PUT \
    --body-data="$UNSEAL_SECRET_BODY" \
    "$APISERVER/api/v1/namespaces/$NAMESPACE/secrets/$UNSEAL_SECRET_NAME" >/dev/null 2>&1 || US_REPLACE_RC=$?
  if [ "$US_REPLACE_RC" -ne 0 ]; then
    # Both writes failed: keep going so the vault is still unsealed
    # in this run, but make the lost persistence visible in the logs.
    echo "[openbao-init] WARN: could not persist unseal keys to $UNSEAL_SECRET_NAME (create rc=$US_CREATE_RC, replace rc=$US_REPLACE_RC) — continuing with unseal, but a later sealed restart will need manual recovery"
  fi
fi
I=0
while [ "$I" -lt "$UNSEAL_THRESHOLD" ]; do
I=$((I+1))
@ -250,9 +356,7 @@ spec:
# ─── Step 4: write bootstrap-marker Secret ──────────────────
echo "[openbao-init] writing bootstrap-marker openbao-init-marker"
TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
CACERT=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
APISERVER="https://kubernetes.default.svc"
# TOKEN/CACERT/APISERVER were defined in Step 2 above.
MARKER_PAYLOAD=$(printf '{"apiVersion":"v1","kind":"Secret","metadata":{"name":"openbao-init-marker","namespace":"%s","labels":{"catalyst.openova.io/blueprint":"bp-openbao","catalyst.openova.io/component":"openbao-init-marker"}},"type":"Opaque","data":{"initialised-at":"%s"}}' \
"$NAMESPACE" "$(date -u +%Y-%m-%dT%H:%M:%SZ | base64 | tr -d '\n')")
# Try create; if it exists, patch.