diff --git a/clusters/_template/bootstrap-kit/06a-bp-self-sovereign-cutover.yaml b/clusters/_template/bootstrap-kit/06a-bp-self-sovereign-cutover.yaml index 856687da..ab2105fe 100644 --- a/clusters/_template/bootstrap-kit/06a-bp-self-sovereign-cutover.yaml +++ b/clusters/_template/bootstrap-kit/06a-bp-self-sovereign-cutover.yaml @@ -174,7 +174,21 @@ spec: # Also drops the pre-flight cutoverComplete=true short- # circuit since /internal/cutover/trigger is itself # idempotent. - version: 0.1.18 + # 0.1.19: Step-01 gitea-mirror DNS race + backoffLimit=0 (#968). + # 0.1.18 unblocked the auto-trigger so the cutover engine fired + # correctly on otech115 (2026-05-05) — but Step-01 then failed within + # 8s with `wget: bad address gitea-http.gitea.svc.cluster.local`. + # The gitea Pod had reached Ready ~2-3s prior; cluster-DNS + # endpoint propagation was still in flight. catalyst-api + # stamped the Job with `backoffLimit=0` (cutover.go:584), so + # one DNS miss was terminal and the cutover engine aborted all + # 8 steps. Fix is dual: (a) catalyst-api now stamps Jobs with + # `backoffLimit=3` so a single miss is recoverable; (b) Step-01 + # bash script gains an explicit `nslookup` readiness loop (30 × + # 5s) at the top, before any wget call. Both layers are needed — + # the in-script probe is fastest; the backoffLimit is the + # safety net for any other transient pre-cluster-stable race. + version: 0.1.19 sourceRef: kind: HelmRepository name: bp-self-sovereign-cutover diff --git a/platform/self-sovereign-cutover/chart/Chart.yaml b/platform/self-sovereign-cutover/chart/Chart.yaml index 125036bb..70ea77dc 100644 --- a/platform/self-sovereign-cutover/chart/Chart.yaml +++ b/platform/self-sovereign-cutover/chart/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: bp-self-sovereign-cutover -version: 0.1.18 +version: 0.1.19 description: | Catalyst Self-Sovereignty Cutover Blueprint. 
Installs DORMANT — this chart ships eight step ConfigMaps (PodSpec ConfigMaps, one per step), diff --git a/platform/self-sovereign-cutover/chart/templates/01-gitea-mirror-job.yaml b/platform/self-sovereign-cutover/chart/templates/01-gitea-mirror-job.yaml index 38f554cb..6f4eb101 100644 --- a/platform/self-sovereign-cutover/chart/templates/01-gitea-mirror-job.yaml +++ b/platform/self-sovereign-cutover/chart/templates/01-gitea-mirror-job.yaml @@ -76,6 +76,40 @@ data: echo "[gitea-mirror] target=${redacted_url}" echo "[gitea-mirror] mirror_interval=${MIRROR_INTERVAL}" + # #968 — DNS-readiness probe for gitea-http. + # + # The cutover auto-trigger fires within seconds of the + # bp-self-sovereign-cutover Helm-install completing. On a fresh + # Sovereign the gitea Pod can still be moving from Running + # to Ready, in which case the headless service `gitea-http` + # has no DNS record published yet. Without this probe the + # very first wget call returns `bad address` and the Job + # exits non-zero. With backoffLimit=0 catalyst-api's cutover + # engine treated that as a hard failure; #968 raised it to 3 + # (cutover.go), but resolving locally here is cheaper and faster + # than burning Pod-restart budget. On otech115 2026-05-05 + # this race fired Step-01 at +8s after gitea reached Ready + # and DNS hadn't propagated; one nslookup wait of ~10s would + # have been sufficient. Loop budget = 30 × 5s = 150s of sleep + # (plus nslookup time), well under the step's activeDeadlineSeconds. 
+ gitea_host="$(printf '%s' "${GITEA_INTERNAL_URL}" | sed -E 's|^https?://||' | cut -d: -f1 | cut -d/ -f1)" + if [ -n "${gitea_host}" ]; then + echo "[gitea-mirror] waiting for DNS resolution of ${gitea_host}" + dns_ready="false" + for i in $(seq 1 30); do + if nslookup "${gitea_host}" >/dev/null 2>&1; then + echo "[gitea-mirror] DNS ready for ${gitea_host} (attempt ${i})" + dns_ready="true" + break + fi + sleep 5 + done + if [ "${dns_ready}" != "true" ]; then + echo "[gitea-mirror] FATAL: ${gitea_host} did not resolve within 150s" >&2 + exit 1 + fi + fi + # Build BusyBox-wget-compatible Basic auth header. printf -n # avoids the trailing newline that would otherwise corrupt # the base64 encoding (and thus the credential). diff --git a/platform/self-sovereign-cutover/chart/tests/cutover-contract.sh b/platform/self-sovereign-cutover/chart/tests/cutover-contract.sh index be93a473..cc04342d 100755 --- a/platform/self-sovereign-cutover/chart/tests/cutover-contract.sh +++ b/platform/self-sovereign-cutover/chart/tests/cutover-contract.sh @@ -245,4 +245,26 @@ if grep -E "grep.*cutoverComplete.*/tmp/status\.json" "$TMP/render.yaml" >/dev/n fi echo " PASS (no stale cutoverComplete pre-read)" +echo "[cutover-contract] Case 15: Step-01 gitea-mirror has DNS-readiness probe (#968)" +# 0.1.18 Step-01 fired wget against gitea-http.gitea.svc.cluster.local +# the moment the auto-trigger fired, racing the gitea Pod's endpoint +# publication. One DNS miss returned `wget: bad address` and (combined +# with catalyst-api's backoffLimit=0) terminated the Job permanently +# — which the cutover engine surfaced as a hard cutover failure (caught +# live on otech115 2026-05-05). +# +# 0.1.19 Step-01 prefixes its wget calls with an `nslookup` readiness +# loop (30 x 5s) so the Job tolerates the ~10s endpoint-publish lag +# without burning Pod-restart budget. This gate guards against future +# regressions that drop the loop. +if ! 
grep -q 'nslookup "${gitea_host}"' "$TMP/render.yaml"; then + echo "FAIL: Step-01 gitea-mirror missing nslookup readiness probe (#968)" >&2 + exit 1 +fi +if ! grep -q 'gitea_host=' "$TMP/render.yaml"; then + echo "FAIL: Step-01 gitea-mirror missing gitea_host= variable extraction (#968)" >&2 + exit 1 +fi +echo " PASS (Step-01 has DNS readiness probe)" + echo "[cutover-contract] All gates green." diff --git a/products/catalyst/bootstrap/api/internal/handler/cutover.go b/products/catalyst/bootstrap/api/internal/handler/cutover.go index c7e6e6d2..ea5844b4 100644 --- a/products/catalyst/bootstrap/api/internal/handler/cutover.go +++ b/products/catalyst/bootstrap/api/internal/handler/cutover.go @@ -581,7 +581,16 @@ func cutoverJobName(stepName string, runEpoch int64) string { // hook-style Helm Jobs the bootstrap-kit uses elsewhere. func createCutoverJob(ctx context.Context, deps *cutoverDeps, step cutoverStep, runEpoch int64) (*batchv1.Job, error) { name := cutoverJobName(step.stepName, runEpoch) - backoffLimit := int32(0) // No retries — fail fast, surface to the operator. + // #968 — backoffLimit raised from 0 to 3 to absorb the gitea-mirror + // step's known race against gitea-http endpoint publication. The + // step Pod can land in scheduling within seconds of the gitea Pod + // reaching Ready, before cluster-DNS endpoint propagation. One DNS + // miss used to be terminal because the Job had no retry budget; + // the cutover engine then aborted all 8 steps. With backoffLimit=3 + // + the per-step DNS readiness probe (chart-side), a single miss + // is recoverable and steps still surface real failures (4× attempts + // over the activeDeadlineSeconds window). + backoffLimit := int32(3) ttl := int32(24 * 60 * 60) // 24h GC so the Job evidence stays around for audit. activeDeadline := int64(cutoverStepTimeout().Seconds()) job := &batchv1.Job{