fix(catalyst-api): handover auto-fire waits for sovereign-wildcard-tls Ready=True (#780) (#964)

PR #778 (#764+#768) auto-fires the handover JWT mint immediately
after Phase-1 reaches OutcomeReady. But Phase-1 ready means 38/38
HRs are installed — the wildcard TLS cert's DNS-01 challenge is a
separate downstream watch that typically takes 30s-3min after
Phase-1 terminates. Until now the wizard rendered the redirect
button at https://console.<fqdn> while TLS was still self-signed
or Issuing, so the operator's first contact with their new
Sovereign was a browser security warning.

Live evidence — otech94 2026-05-04: handover fired at 16:17:09Z
immediately after Phase-1 Ready, but the TLS handshake failed for
~90s until cert-manager finished issuing. Banner appeared with
non-clickable URL.

Fix: fireHandover now blocks the JWT mint behind
waitForWildcardCert which polls the new Sovereign's
sovereign-wildcard-tls Certificate (kube-system) for Ready=True
via cert-manager.io/v1 status.conditions. Bounded timeout
(DefaultHandoverCertWaitTimeout, 10m) so a stuck cert never
hangs the wizard — on timeout we emit a warn event and proceed
with the mint anyway (better to give the operator a redirect
URL they can retry than leave them stuck with status=ready and
no redirect at all).

Graceful degradation when the cert can't be queried: deployments
without a kubeconfig path on disk (test fixtures, Sovereign-side
callers) skip the wait silently and mint immediately. Existing
tests continue to pass without modification.

Per docs/INVIOLABLE-PRINCIPLES.md #4 the wait timeout + poll
cadence are runtime-configurable via
CATALYST_HANDOVER_CERT_WAIT_TIMEOUT and
CATALYST_HANDOVER_CERT_POLL_INTERVAL.

Tests: 8 new unit tests in phase1_watch_cert_wait_test.go cover
cert-already-Ready (fast path), cert-never-Ready (timeout path),
cert-not-found-then-appears (poll path), no-kubeconfig (skip
path), and the certificateReady / wildcardCertReady parsers
against the cert-manager.io/v1 Certificate shape.

Co-authored-by: hatiyildiz <hatice.yildiz@openova.io>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
e3mrah 2026-05-05 16:15:37 +04:00 committed by GitHub
parent dea9471141
commit 3de37865c9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 725 additions and 0 deletions

View File

@ -36,6 +36,11 @@ import (
"strings" "strings"
"time" "time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/dynamic"
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/handoverjwt" "github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/handoverjwt"
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/helmwatch" "github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/helmwatch"
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/provisioner" "github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/provisioner"
@ -113,6 +118,52 @@ const DefaultKubeconfigArrivalTimeout = 15 * time.Minute
// kubeconfig-arrival poll cadence. Issue #538. // kubeconfig-arrival poll cadence. Issue #538.
const DefaultKubeconfigArrivalPollInterval = 15 * time.Second const DefaultKubeconfigArrivalPollInterval = 15 * time.Second
// Configuration for the handover cert-wait gate (issue #780).
//
// Phase-1 ready does NOT imply the new Sovereign's wildcard TLS cert
// has issued: DNS-01 with PowerDNS typically takes 30s-3min after
// Phase-1 terminates. The handover auto-fire therefore waits for the
// Certificate named below to report Ready=True before emitting the
// handoverURL, bounded by the timeout declared here.
const (
	// handoverCertWaitTimeoutEnv is the env var that overrides how
	// long the handover auto-fire waits for the wildcard cert to
	// reach Ready=True before emitting the handoverURL anyway.
	// Without the gate the wizard renders the redirect button at a
	// console URL that fails TLS for ~90s.
	handoverCertWaitTimeoutEnv = "CATALYST_HANDOVER_CERT_WAIT_TIMEOUT"

	// handoverCertPollIntervalEnv is the env var that overrides the
	// cadence at which the cert's status.conditions[type=Ready]
	// entry is polled while waiting.
	handoverCertPollIntervalEnv = "CATALYST_HANDOVER_CERT_POLL_INTERVAL"

	// DefaultHandoverCertWaitTimeout is the production default for
	// the wildcard-cert wait window. 10 minutes is generous headroom:
	// the DNS-01 challenge typically completes within 90 seconds of
	// bp-cert-manager-powerdns-webhook becoming ready, which is
	// itself one of the 38 HRs Phase-1 waits on.
	DefaultHandoverCertWaitTimeout = 10 * time.Minute

	// DefaultHandoverCertPollInterval is the production default for
	// the wildcard-cert poll cadence. 10s keeps the wizard log pane
	// informative without thrashing the Sovereign's apiserver.
	DefaultHandoverCertPollInterval = 10 * time.Second

	// sovereignWildcardCertName is the name of the Certificate the
	// handover auto-fire waits on. Both
	// clusters/_template/sovereign-tls/cilium-gateway-cert.yaml
	// (single-zone overlay) and
	// products/catalyst/chart/templates/sovereign-wildcard-certs.yaml
	// (multi-zone overlay) produce a Certificate with this name.
	sovereignWildcardCertName = "sovereign-wildcard-tls"

	// sovereignWildcardCertNamespace is the namespace holding that
	// Certificate. The Cilium Gateway listener references a Secret
	// of the same name in the same namespace, so this MUST match
	// the chart + legacy template.
	sovereignWildcardCertNamespace = "kube-system"
)
// runPhase1Watch builds a helmwatch.Watcher and runs it to completion. // runPhase1Watch builds a helmwatch.Watcher and runs it to completion.
// All emit goes through h.emitWatchEvent so the durable buffer + SSE // All emit goes through h.emitWatchEvent so the durable buffer + SSE
// channel get every per-component event. // channel get every per-component event.
@ -506,6 +557,15 @@ func (h *Handler) markPhase1Done(dep *Deployment, finalStates map[string]string,
// does NOT mint a second JWT. The first mint wins; the second call // does NOT mint a second JWT. The first mint wins; the second call
// returns silently without emitting a duplicate SSE event. // returns silently without emitting a duplicate SSE event.
// //
// Issue #780 — before minting, fireHandover blocks on the new
// Sovereign's `sovereign-wildcard-tls` Certificate reaching
// Ready=True via waitForWildcardCert. Phase-1 ready means 38/38
// HRs are installed but the cert's DNS-01 challenge is a separate
// downstream watch — it can take 30s-3min to land. Without the
// gate, the handoverURL points at https://console.<fqdn> while
// TLS is still self-signed/issuing, and the operator's first
// click on their new Sovereign hits a browser security warning.
//
// Failure modes: // Failure modes:
// - h.handoverSigner is nil — log + skip. Production catalyst-api // - h.handoverSigner is nil — log + skip. Production catalyst-api
// always has a wired Signer (cmd/api/main.go LoadOrGenerate's the // always has a wired Signer (cmd/api/main.go LoadOrGenerate's the
@ -517,6 +577,11 @@ func (h *Handler) markPhase1Done(dep *Deployment, finalStates map[string]string,
// - h.handoverSigner.MintToken returns an error — log + skip. The // - h.handoverSigner.MintToken returns an error — log + skip. The
// UI's status=ready + handoverURL=="" branch renders a manual- // UI's status=ready + handoverURL=="" branch renders a manual-
// mint button so the operator is never silently stranded. // mint button so the operator is never silently stranded.
// - sovereign-wildcard-tls never reaches Ready=True within
// DefaultHandoverCertWaitTimeout — log + emit a warn event +
// proceed with the mint. Per issue #780 spec we'd rather emit a
// handoverURL the operator can retry than leave them stuck with
// status=ready and no redirect at all.
// //
// Per docs/INVIOLABLE-PRINCIPLES.md #10 the JWT itself is NEVER logged // Per docs/INVIOLABLE-PRINCIPLES.md #10 the JWT itself is NEVER logged
// — only the deployment id + the post-mint expiry timestamp lands in // — only the deployment id + the post-mint expiry timestamp lands in
@ -566,6 +631,23 @@ func (h *Handler) fireHandover(dep *Deployment) {
return return
} }
// Issue #780 — block the mint until the new Sovereign's wildcard
// TLS cert (`sovereign-wildcard-tls` in `kube-system`) reaches
// Ready=True. Phase-1 ready means 38/38 HRs are installed, but
// the DNS-01 challenge for the wildcard cert is a separate
// downstream watch — it can take 30s-3min to land after Phase-1
// terminates. Without this gate the wizard renders the redirect
// button at a console URL whose TLS handshake fails for ~90s,
// making the operator's first contact with their new Sovereign a
// browser security warning.
//
// Bounded timeout (DefaultHandoverCertWaitTimeout, 10m): if the
// cert never lands, we emit the handoverURL anyway with a warn
// event. The operator can retry the redirect in their browser
// once TLS settles. This is the lesser evil vs leaving the
// deployment stuck with status=ready but no redirect URL.
h.waitForWildcardCert(dep)
tokenStr, err := h.handoverSigner.MintToken(fqdn, depID, owner, owner) tokenStr, err := h.handoverSigner.MintToken(fqdn, depID, owner, owner)
if err != nil { if err != nil {
h.log.Error("handover auto-fire: MintToken failed", h.log.Error("handover auto-fire: MintToken failed",
@ -711,3 +793,199 @@ func (h *Handler) waitForKubeconfig(dep *Deployment) (string, bool) {
time.Sleep(pollEvery) time.Sleep(pollEvery)
} }
} }
// certificateGVR identifies the cert-manager.io/v1 Certificate
// resource for dynamic-client lookups. It is a package-level var
// (rather than a literal at the call site) so tests can swap it if
// a future cert-manager release bumps the API version. Unexported:
// the cert-wait path is its only production consumer.
var certificateGVR = schema.GroupVersion{
	Group:   "cert-manager.io",
	Version: "v1",
}.WithResource("certificates")
// waitForWildcardCert polls the new Sovereign's apiserver until the
// `sovereign-wildcard-tls` Certificate reports
// status.conditions[type=Ready] = True, or until the wait timeout
// elapses — whichever happens first. It is called by the handover
// auto-fire immediately before the JWT is minted.
//
// The wait is always bounded. The issue #780 spec asked for roughly
// five minutes ("If cert doesn't land in 5 min, log + emit
// handoverURL anyway (operator can retry)"); the shipped default is
// DefaultHandoverCertWaitTimeout (10 minutes). On timeout a warn
// event is emitted and the function returns so the mint proceeds.
// The final sleep is clamped to the time remaining, so the total
// wait does not overshoot the timeout by a full poll interval.
//
// Graceful degradation when the cert can't be queried:
//
//   - no kubeconfig path on the deployment → skip the wait (Info log,
//     no wizard event). Sovereign-side / test paths land here.
//   - kubeconfig unreadable or dynamic client construction fails →
//     skip the wait, logged at Warn with the error.
//   - Get returns an error (including NotFound) → keep polling. The
//     Certificate may not have been applied yet, and transient
//     apiserver errors are retried on the next tick.
//
// Per docs/INVIOLABLE-PRINCIPLES.md #4 the timeout and poll cadence
// are runtime-configurable via CATALYST_HANDOVER_CERT_WAIT_TIMEOUT
// and CATALYST_HANDOVER_CERT_POLL_INTERVAL. A non-zero
// Handler.handoverCertWaitTimeout / Handler.handoverCertPollInterval
// takes precedence over the env (tests inject sub-second values).
func (h *Handler) waitForWildcardCert(dep *Deployment) {
	timeout := h.handoverCertDuration(h.handoverCertWaitTimeout, handoverCertWaitTimeoutEnv, DefaultHandoverCertWaitTimeout)
	pollEvery := h.handoverCertDuration(h.handoverCertPollInterval, handoverCertPollIntervalEnv, DefaultHandoverCertPollInterval)

	dyn, err := h.sovereignDynamicClientForCertWait(dep)
	if err != nil {
		// A kubeconfig path was set but we could not turn it into a
		// client. Skip the gate (the mint must not be blocked), but
		// make the reason visible at Warn rather than Info.
		h.log.Warn("handover cert-wait: skipping (Sovereign dynamic client unavailable; mint proceeds)",
			"id", dep.ID,
			"err", err,
		)
		return
	}
	if dyn == nil {
		// No kubeconfig — legacy behaviour (mint immediately) is
		// preserved for Sovereign-side callers and fixtures that
		// cannot observe the cert.
		h.log.Info("handover cert-wait: skipping (no Sovereign dynamic client available; mint proceeds)",
			"id", dep.ID,
		)
		return
	}

	h.emitWatchEvent(dep, provisioner.Event{
		Time:    time.Now().UTC().Format(time.RFC3339),
		Phase:   helmwatch.PhaseComponent,
		Level:   "info",
		Message: fmt.Sprintf("Handover gate: waiting for sovereign-wildcard-tls Certificate Ready=True before emitting handoverURL (timeout %s, polling every %s). Issue #780.", timeout, pollEvery),
	})

	deadline := time.Now().Add(timeout)
	// The context only bounds each individual Get. The outer loop is
	// governed by the explicit deadline check so the timeout branch
	// always runs and the operator-visible warn event is emitted.
	for {
		getCtx, cancelGet := context.WithTimeout(context.Background(), pollEvery)
		ready, observed, certErr := wildcardCertReady(getCtx, dyn)
		cancelGet()

		if certErr == nil && ready {
			h.emitWatchEvent(dep, provisioner.Event{
				Time:    time.Now().UTC().Format(time.RFC3339),
				Phase:   helmwatch.PhaseComponent,
				Level:   "info",
				Message: "Handover gate: sovereign-wildcard-tls Certificate Ready=True. Emitting handoverURL.",
			})
			h.log.Info("handover cert-wait: cert reached Ready=True; proceeding to mint",
				"id", dep.ID,
			)
			return
		}

		remaining := time.Until(deadline)
		if remaining <= 0 {
			// Timeout — emit a warn event and let the mint proceed.
			// A handoverURL the operator can retry beats leaving the
			// deployment at status=ready with no redirect at all.
			h.emitWatchEvent(dep, provisioner.Event{
				Time:    time.Now().UTC().Format(time.RFC3339),
				Phase:   helmwatch.PhaseComponent,
				Level:   "warn",
				Message: fmt.Sprintf("Handover gate: timed out after %s waiting for sovereign-wildcard-tls Ready=True (last observed status=%q, err=%v). Emitting handoverURL anyway — TLS may need a few seconds to settle in the operator's browser. Issue #780.", timeout, observed, certErr),
			})
			h.log.Warn("handover cert-wait: timeout; minting anyway",
				"id", dep.ID,
				"timeout", timeout,
				"observedStatus", observed,
				"err", certErr,
			)
			return
		}

		// Never sleep past the deadline: when the poll interval is
		// larger than what is left, wake up exactly at the deadline
		// for one last check.
		pause := pollEvery
		if remaining < pause {
			pause = remaining
		}
		time.Sleep(pause)
	}
}

// handoverCertDuration resolves one cert-wait duration setting. A
// non-zero override wins; otherwise the env var named by envKey is
// parsed with time.ParseDuration; otherwise fallback is used. An env
// value that is set but unparseable or non-positive is ignored with
// a Warn log so a misconfiguration is visible instead of silent.
func (h *Handler) handoverCertDuration(override time.Duration, envKey string, fallback time.Duration) time.Duration {
	if override != 0 {
		return override
	}
	raw := envOrEmpty(envKey)
	if raw == "" {
		return fallback
	}
	parsed, err := time.ParseDuration(raw)
	if err != nil || parsed <= 0 {
		h.log.Warn("handover cert-wait: ignoring invalid duration override; using default",
			"env", envKey,
			"value", raw,
			"default", fallback,
			"err", err,
		)
		return fallback
	}
	return parsed
}
// sovereignDynamicClientForCertWait builds the dynamic client the
// cert-wait path polls with.
//
// Returns:
//   - (nil, nil) when the deployment carries no kubeconfig path
//     (dep.Result is nil or its KubeconfigPath is empty) — the
//     caller treats this as "skip the wait".
//   - (nil, err) when the kubeconfig file cannot be read; whatever
//     the client factory returns is passed through unchanged.
//
// h.dynamicFactory, when set, is used in place of
// helmwatch.NewDynamicClientFromKubeconfig.
func (h *Handler) sovereignDynamicClientForCertWait(dep *Deployment) (dynamic.Interface, error) {
	// Snapshot the path under the deployment lock; the file read and
	// client construction happen after the lock is released.
	path := func() string {
		dep.mu.Lock()
		defer dep.mu.Unlock()
		if dep.Result == nil {
			return ""
		}
		return dep.Result.KubeconfigPath
	}()
	if path == "" {
		return nil, nil
	}

	contents, readErr := os.ReadFile(path)
	if readErr != nil {
		return nil, fmt.Errorf("read kubeconfig: %w", readErr)
	}

	kubeconfig := string(contents)
	if h.dynamicFactory == nil {
		return helmwatch.NewDynamicClientFromKubeconfig(kubeconfig)
	}
	return h.dynamicFactory(kubeconfig)
}
// wildcardCertReady fetches the `sovereign-wildcard-tls` Certificate
// from `kube-system` and reports (ready, observedStatus, err).
//
// On a successful Get the result is whatever certificateReady
// returns for the object. On ANY Get error — not only a 404 — the
// result is (false, "<not-found>", err); callers that need to tell
// NotFound apart from other failures must inspect err.
func wildcardCertReady(ctx context.Context, dyn dynamic.Interface) (bool, string, error) {
	certs := dyn.Resource(certificateGVR).Namespace(sovereignWildcardCertNamespace)
	cert, getErr := certs.Get(ctx, sovereignWildcardCertName, metav1.GetOptions{})
	if getErr == nil {
		return certificateReady(cert)
	}
	return false, "<not-found>", getErr
}
// certificateReady scans status.conditions of an unstructured
// cert-manager.io/v1 Certificate for the entry with type=Ready and
// returns (ready, observedStatus, err):
//
//   - status.conditions has the wrong shape → (false,
//     "<status-parse-error>", err).
//   - status.conditions absent or empty → (false, "<no-conditions>", nil).
//   - first type=Ready entry found → (status == "True", status, nil);
//     status is "" when the field is missing or not a string.
//   - no type=Ready entry → (false, "<missing-ready>", nil).
//
// Entries that are not maps are skipped.
func certificateReady(u *unstructured.Unstructured) (bool, string, error) {
	conditions, found, err := unstructured.NestedSlice(u.Object, "status", "conditions")
	switch {
	case err != nil:
		return false, "<status-parse-error>", err
	case !found, len(conditions) == 0:
		return false, "<no-conditions>", nil
	}

	for i := range conditions {
		cond, isMap := conditions[i].(map[string]interface{})
		if !isMap || cond["type"] != "Ready" {
			continue
		}
		observed, _ := cond["status"].(string)
		return observed == "True", observed, nil
	}
	return false, "<missing-ready>", nil
}

View File

@ -0,0 +1,447 @@
// Tests for the handover cert-wait gate (issue #780).
//
// What this file proves:
//
// 1. fireHandover blocks the JWT mint until the new Sovereign's
// `sovereign-wildcard-tls` Certificate reaches Ready=True. The
// wizard's redirect button is NEVER rendered at a console URL
// whose TLS handshake is still failing.
// 2. When the cert never reaches Ready=True within the wait timeout,
// fireHandover proceeds with the mint anyway and emits a warn
// event. The lesser evil is a redirect URL the operator can retry
// vs no redirect at all.
// 3. When the deployment has no kubeconfig path on disk (the
// pre-cert-wait test fixtures + the Sovereign-side path), the
// wait is skipped without log noise — the existing behaviour is
// preserved for callers that can't observe the cert.
// 4. The cert-Ready check parses `status.conditions[type=Ready]` on
// a cert-manager.io/v1 Certificate using the same unstructured
// pattern the existing helmReleaseReady scan uses.
package handler
import (
"context"
"os"
"path/filepath"
"strings"
"testing"
"time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/dynamic"
dynamicfake "k8s.io/client-go/dynamic/fake"
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/helmwatch"
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/provisioner"
)
// makeCert returns an unstructured sovereign-wildcard-tls Certificate
// in kube-system whose single status condition is type=Ready with
// the supplied status ("True" / "False" / "Unknown"). The reason and
// message fields are fixed regardless of readyStatus.
func makeCert(readyStatus string) *unstructured.Unstructured {
	readyCondition := map[string]any{
		"type":               "Ready",
		"status":             readyStatus,
		"reason":             "Ready",
		"message":            "Certificate is up to date and has not expired",
		"lastTransitionTime": time.Now().UTC().Format(time.RFC3339),
	}

	cert := &unstructured.Unstructured{Object: map[string]any{
		"apiVersion": "cert-manager.io/v1",
		"kind":       "Certificate",
		"metadata": map[string]any{
			"name":      sovereignWildcardCertName,
			"namespace": sovereignWildcardCertNamespace,
		},
		"spec": map[string]any{
			"secretName": sovereignWildcardCertName,
			"commonName": "*.test.example.com",
		},
		"status": map[string]any{
			"conditions": []any{readyCondition},
		},
	}}
	cert.SetGroupVersionKind(
		schema.GroupVersion{Group: "cert-manager.io", Version: "v1"}.WithKind("Certificate"),
	)
	return cert
}
// makeCertNoConditions returns an unstructured sovereign-wildcard-tls
// Certificate with metadata and spec only — no status block at all.
// This models a Certificate that has been created but not yet
// reconciled.
func makeCertNoConditions() *unstructured.Unstructured {
	cert := &unstructured.Unstructured{Object: map[string]any{
		"apiVersion": "cert-manager.io/v1",
		"kind":       "Certificate",
		"metadata": map[string]any{
			"name":      sovereignWildcardCertName,
			"namespace": sovereignWildcardCertNamespace,
		},
		"spec": map[string]any{
			"secretName": sovereignWildcardCertName,
			"commonName": "*.test.example.com",
		},
	}}
	cert.SetGroupVersionKind(
		schema.GroupVersion{Group: "cert-manager.io", Version: "v1"}.WithKind("Certificate"),
	)
	return cert
}
// fakeDynamicFactoryWithCerts returns a dynamicFactory-shaped closure.
// Every invocation ignores its kubeconfig argument and builds a
// FRESH fake dynamic client seeded with certs, so state is not shared
// between invocations. Both the Certificate and HelmRelease list
// kinds are registered so a Handler that also lists HelmReleases can
// use the same factory.
func fakeDynamicFactoryWithCerts(certs ...runtime.Object) func(string) (dynamic.Interface, error) {
	certListGVK := schema.GroupVersion{Group: "cert-manager.io", Version: "v1"}.WithKind("CertificateList")

	return func(string) (dynamic.Interface, error) {
		scheme := runtime.NewScheme()
		scheme.AddKnownTypeWithName(helmReleaseListGVK_handler, &unstructured.UnstructuredList{})
		scheme.AddKnownTypeWithName(certListGVK, &unstructured.UnstructuredList{})

		listKinds := map[schema.GroupVersionResource]string{
			helmwatch.HelmReleaseGVR: "HelmReleaseList",
			certificateGVR:           "CertificateList",
		}
		return dynamicfake.NewSimpleDynamicClientWithCustomListKinds(scheme, listKinds, certs...), nil
	}
}
// writeKubeconfigOnDisk creates "<id>.yaml" inside a fresh
// t.TempDir() containing a minimal placeholder kubeconfig (mode
// 0600) and returns its path. The cert-wait path reads the file
// before invoking the client factory, so the path must be readable
// even though the fake factory ignores the contents. Fails the test
// if the write fails.
func writeKubeconfigOnDisk(t *testing.T, id string) string {
	t.Helper()

	const placeholder = "apiVersion: v1\nkind: Config\n"
	kubeconfig := filepath.Join(t.TempDir(), id+".yaml")

	err := os.WriteFile(kubeconfig, []byte(placeholder), 0o600)
	if err != nil {
		t.Fatalf("write kubeconfig: %v", err)
	}
	return kubeconfig
}
// makeCertWaitDeployment builds the Deployment fixture used by the
// cert-wait tests. Result.KubeconfigPath points at a real temp file
// (see writeKubeconfigOnDisk) so sovereignDynamicClientForCertWait
// reaches the client factory instead of short-circuiting.
func makeCertWaitDeployment(t *testing.T, id string) *Deployment {
	t.Helper()

	const fqdn = "otech-cert.example.com"
	kubeconfig := writeKubeconfigOnDisk(t, id)

	return &Deployment{
		ID:         id,
		Status:     "phase1-watching",
		StartedAt:  time.Now(),
		eventsCh:   make(chan provisioner.Event, 256),
		done:       make(chan struct{}),
		Request:    provisioner.Request{SovereignFQDN: fqdn},
		Result:     &provisioner.Result{SovereignFQDN: fqdn, KubeconfigPath: kubeconfig},
		OwnerEmail: "operator@cert.example.com",
	}
}
// TestFireHandover_WaitsForWildcardCertReady proves the fast path of
// the cert-wait gate: with a Ready=True Certificate already present,
// fireHandover observes it on the first poll, emits the gate's
// success event, and only then mints the handover URL. Issue #780
// DoD.
//
// The success event is matched on its full text. The shorter
// substring "Certificate Ready=True" also appears in the gate's
// initial "waiting for ..." event and would make the assertion pass
// even if the success event were never emitted.
func TestFireHandover_WaitsForWildcardCertReady(t *testing.T) {
	const gateSuccessMsg = "Handover gate: sovereign-wildcard-tls Certificate Ready=True. Emitting handoverURL."

	h := NewWithPDM(silentLogger(), &fakePDM{})
	h.SetHandoverSigner(loadTestSigner(t))
	// Seed the fake apiserver with a Ready=True cert. The wait path
	// observes it on first poll and returns immediately.
	h.dynamicFactory = fakeDynamicFactoryWithCerts(makeCert("True"))
	h.handoverCertWaitTimeout = 2 * time.Second
	h.handoverCertPollInterval = 20 * time.Millisecond

	dep := makeCertWaitDeployment(t, "cert-wait-ready")
	h.deployments.Store(dep.ID, dep)

	start := time.Now()
	h.fireHandover(dep)
	elapsed := time.Since(start)
	if elapsed > 1*time.Second {
		t.Errorf("fireHandover took %s with cert already Ready; expected <1s", elapsed)
	}

	dep.mu.Lock()
	defer dep.mu.Unlock()
	if dep.Result.HandoverFiredAt == nil {
		t.Fatalf("HandoverFiredAt was not set after Ready cert")
	}
	if dep.Result.HandoverURL == "" {
		t.Fatalf("HandoverURL was not set after Ready cert")
	}
	if !strings.HasPrefix(dep.Result.HandoverURL, "https://console.otech-cert.example.com/auth/handover?token=") {
		t.Errorf("HandoverURL has unexpected shape: %q", dep.Result.HandoverURL)
	}

	// Locate the first gate-success event and the first
	// handover-ready event so both presence AND ordering can be
	// asserted.
	gateIdx, handoverIdx := -1, -1
	for i, ev := range dep.eventsBuf {
		if gateIdx < 0 && strings.Contains(ev.Message, gateSuccessMsg) {
			gateIdx = i
		}
		if handoverIdx < 0 && ev.Phase == PhaseHandoverReady {
			handoverIdx = i
		}
	}
	if gateIdx < 0 {
		t.Errorf("cert-wait gate did not emit Ready=True success event; got=%+v", dep.eventsBuf)
	}
	if handoverIdx < 0 {
		t.Errorf("handover-ready event missing from durable buffer; got=%+v", dep.eventsBuf)
	}
	if gateIdx >= 0 && handoverIdx >= 0 && gateIdx > handoverIdx {
		t.Errorf("gate success event (index %d) landed after handover-ready event (index %d); got=%+v", gateIdx, handoverIdx, dep.eventsBuf)
	}
}
// TestFireHandover_TimesOutAndMintsAnyway covers the timeout path: a
// Certificate that stays Ready=False makes the gate wait out the
// full timeout, emit a warn event, and still let the mint proceed —
// per issue #780 a retryable redirect URL beats status=ready with no
// redirect at all.
func TestFireHandover_TimesOutAndMintsAnyway(t *testing.T) {
	const waitTimeout = 200 * time.Millisecond

	h := NewWithPDM(silentLogger(), &fakePDM{})
	h.SetHandoverSigner(loadTestSigner(t))
	// The seeded cert is Ready=False and nothing ever flips it, so
	// the wait can only end via the deadline.
	h.dynamicFactory = fakeDynamicFactoryWithCerts(makeCert("False"))
	h.handoverCertWaitTimeout = waitTimeout
	h.handoverCertPollInterval = 20 * time.Millisecond

	dep := makeCertWaitDeployment(t, "cert-wait-timeout")
	h.deployments.Store(dep.ID, dep)

	began := time.Now()
	h.fireHandover(dep)
	took := time.Since(began)
	switch {
	case took < waitTimeout:
		t.Errorf("fireHandover returned in %s with cert still Ready=False; expected to wait at least timeout", took)
	case took > 2*time.Second:
		t.Errorf("fireHandover took %s; expected to bound at timeout+epsilon", took)
	}

	dep.mu.Lock()
	defer dep.mu.Unlock()

	// The mint must have gone ahead despite the timeout.
	if dep.Result.HandoverFiredAt == nil {
		t.Fatalf("HandoverFiredAt was not set after timeout — mint should have proceeded anyway")
	}
	if dep.Result.HandoverURL == "" {
		t.Fatalf("HandoverURL was not set after timeout — mint should have proceeded anyway")
	}

	// The gate must have surfaced the timeout as a warn event.
	isTimeoutWarn := func(level, msg string) bool {
		return level == "warn" &&
			strings.Contains(msg, "timed out") &&
			strings.Contains(msg, "sovereign-wildcard-tls")
	}
	found := false
	for i := 0; i < len(dep.eventsBuf) && !found; i++ {
		found = isTimeoutWarn(dep.eventsBuf[i].Level, dep.eventsBuf[i].Message)
	}
	if !found {
		t.Errorf("cert-wait gate did not emit timeout warn event; got=%+v", dep.eventsBuf)
	}
}
// TestFireHandover_NotFoundKeepsPollingThenSucceeds proves that when
// the Certificate is initially absent (Get errors on the fake
// client) and appears later, the gate keeps polling and succeeds
// once the resource lands. Mirrors the production race where
// Phase-1 Ready fires a few seconds before the Certificate resource
// is applied.
//
// The factory built by fakeDynamicFactoryWithCerts creates a new
// client on every call, so this test builds ONE empty client up
// front and hands that same instance to the handler; the Certificate
// is then created on it from a goroutine while fireHandover is
// polling.
func TestFireHandover_NotFoundKeepsPollingThenSucceeds(t *testing.T) {
	const appearAfter = 150 * time.Millisecond

	h := NewWithPDM(silentLogger(), &fakePDM{})
	h.SetHandoverSigner(loadTestSigner(t))

	// Start with no cert in the (shared) fake apiserver.
	dyn, err := fakeDynamicFactoryWithCerts()("")
	if err != nil {
		t.Fatalf("build fake dynamic client: %v", err)
	}
	h.dynamicFactory = func(string) (dynamic.Interface, error) { return dyn, nil }
	h.handoverCertWaitTimeout = 2 * time.Second
	h.handoverCertPollInterval = 20 * time.Millisecond

	dep := makeCertWaitDeployment(t, "cert-wait-notfound")
	h.deployments.Store(dep.ID, dep)

	// start is taken BEFORE the goroutine launches so the lower
	// bound asserted below is guaranteed: the cert cannot exist
	// earlier than start+appearAfter.
	start := time.Now()
	createErr := make(chan error, 1)
	go func() {
		time.Sleep(appearAfter)
		_, cErr := dyn.Resource(certificateGVR).
			Namespace(sovereignWildcardCertNamespace).
			Create(context.Background(), makeCert("True"), metav1.CreateOptions{})
		createErr <- cErr
	}()

	h.fireHandover(dep)
	elapsed := time.Since(start)

	if cErr := <-createErr; cErr != nil {
		t.Fatalf("create cert mid-wait: %v", cErr)
	}
	if elapsed < appearAfter {
		t.Errorf("fireHandover returned in %s, before the cert existed (%s); the gate did not wait", elapsed, appearAfter)
	}

	dep.mu.Lock()
	defer dep.mu.Unlock()
	if dep.Result.HandoverFiredAt == nil {
		t.Fatalf("HandoverFiredAt was not set after cert eventually appeared")
	}
	if dep.Result.HandoverURL == "" {
		t.Fatalf("HandoverURL was not set after cert eventually appeared")
	}

	// The wait must have ended via the success path, not the
	// timeout path.
	var sawSuccess bool
	for _, ev := range dep.eventsBuf {
		if ev.Level == "warn" && strings.Contains(ev.Message, "timed out") {
			t.Errorf("cert-wait gate timed out even though the cert appeared mid-wait: %+v", ev)
		}
		if strings.Contains(ev.Message, "Certificate Ready=True. Emitting handoverURL.") {
			sawSuccess = true
		}
	}
	if !sawSuccess {
		t.Errorf("cert-wait gate did not emit Ready=True success event; got=%+v", dep.eventsBuf)
	}
}
// TestFireHandover_NoKubeconfigSkipsCertWait covers the skip path: a
// deployment whose Result carries no KubeconfigPath must bypass the
// cert-wait gate entirely, so fireHandover mints immediately and no
// "Handover gate:" event reaches the durable buffer.
func TestFireHandover_NoKubeconfigSkipsCertWait(t *testing.T) {
	const fqdn = "otech-no-kc.example.com"

	h := NewWithPDM(silentLogger(), &fakePDM{})
	h.SetHandoverSigner(loadTestSigner(t))
	// Deliberately long timeout: if the gate did NOT skip, the
	// elapsed-time assertion below would catch it.
	h.handoverCertWaitTimeout = 10 * time.Second
	h.handoverCertPollInterval = 50 * time.Millisecond

	dep := &Deployment{
		ID:        "cert-wait-no-kubeconfig",
		Status:    "phase1-watching",
		StartedAt: time.Now(),
		eventsCh:  make(chan provisioner.Event, 256),
		done:      make(chan struct{}),
		Request:   provisioner.Request{SovereignFQDN: fqdn},
		// KubeconfigPath is intentionally left empty.
		Result:     &provisioner.Result{SovereignFQDN: fqdn},
		OwnerEmail: "operator@no-kc.example.com",
	}
	h.deployments.Store(dep.ID, dep)

	began := time.Now()
	h.fireHandover(dep)
	if took := time.Since(began); took > 1*time.Second {
		t.Errorf("fireHandover took %s with no kubeconfig; cert-wait should have skipped immediately", took)
	}

	dep.mu.Lock()
	defer dep.mu.Unlock()
	if dep.Result.HandoverFiredAt == nil {
		t.Fatalf("HandoverFiredAt was not set; mint should proceed when cert-wait is skipped")
	}

	// The gate returns before emitting anything, so no event with
	// the gate prefix may exist.
	for i := range dep.eventsBuf {
		ev := dep.eventsBuf[i]
		if !strings.Contains(ev.Message, "Handover gate:") {
			continue
		}
		t.Errorf("cert-wait gate emitted an event despite missing kubeconfig: %+v", ev)
	}
}
// TestCertificateReady_ParsesReadyTrue checks that certificateReady
// reports ready=true and observed="True" for a Certificate whose
// status.conditions holds type=Ready, status=True.
func TestCertificateReady_ParsesReadyTrue(t *testing.T) {
	const want = "True"

	ready, observed, err := certificateReady(makeCert(want))
	switch {
	case err != nil:
		t.Fatalf("err = %v", err)
	case !ready:
		t.Errorf("ready = false, want true on cert with Ready=True; observed=%q", observed)
	}
	if observed != want {
		t.Errorf("observed = %q, want %q", observed, want)
	}
}
// TestCertificateReady_FalseStatusReportsNotReady checks that a
// Ready=False condition yields ready=false while the raw status
// string is carried through as the observed value.
func TestCertificateReady_FalseStatusReportsNotReady(t *testing.T) {
	const want = "False"

	ready, observed, err := certificateReady(makeCert(want))
	switch {
	case err != nil:
		t.Fatalf("err = %v", err)
	case ready:
		t.Errorf("ready = true, want false on cert with Ready=False")
	}
	if observed != want {
		t.Errorf("observed = %q, want %q", observed, want)
	}
}
// TestCertificateReady_NoConditionsReportsNotReady checks that a
// Certificate with no status block yields ready=false and the
// "<no-conditions>" telemetry sentinel.
func TestCertificateReady_NoConditionsReportsNotReady(t *testing.T) {
	const want = "<no-conditions>"

	ready, observed, err := certificateReady(makeCertNoConditions())
	switch {
	case err != nil:
		t.Fatalf("err = %v", err)
	case ready:
		t.Errorf("ready = true, want false on cert without conditions")
	}
	if observed != want {
		t.Errorf("observed = %q, want %q", observed, want)
	}
}
// TestWildcardCertReady_GetsTheRightResource checks that
// wildcardCertReady finds a Certificate seeded under the expected
// GVR, namespace and name, then cross-checks with a direct Get on
// the same fake client. Moving the Certificate to another namespace
// or name without updating the constants makes this fail.
func TestWildcardCertReady_GetsTheRightResource(t *testing.T) {
	dyn, err := fakeDynamicFactoryWithCerts(makeCert("True"))("")
	if err != nil {
		t.Fatalf("factory: %v", err)
	}

	ctx := context.Background()
	ready, observed, err := wildcardCertReady(ctx, dyn)
	if err != nil {
		t.Fatalf("wildcardCertReady: %v", err)
	}
	if !ready {
		t.Errorf("ready = false, want true; observed=%q", observed)
	}

	// Cross-check name + namespace with a direct Get.
	certs := dyn.Resource(certificateGVR).Namespace(sovereignWildcardCertNamespace)
	got, err := certs.Get(ctx, sovereignWildcardCertName, metav1.GetOptions{})
	if err != nil {
		t.Fatalf("Get %s/%s: %v", sovereignWildcardCertNamespace, sovereignWildcardCertName, err)
	}
	if name := got.GetName(); name != sovereignWildcardCertName {
		t.Errorf("Get returned name=%q, want %q", name, sovereignWildcardCertName)
	}
	if ns := got.GetNamespace(); ns != sovereignWildcardCertNamespace {
		t.Errorf("Get returned namespace=%q, want %q", ns, sovereignWildcardCertNamespace)
	}
}