PR #778 (#764+#768) auto-fires the handover JWT mint immediately after Phase-1 reaches OutcomeReady. But Phase-1 ready means 38/38 HRs are installed — the wildcard TLS cert's DNS-01 challenge is a separate downstream watch that typically takes 30s-3min after Phase-1 terminates. Until now the wizard rendered the redirect button at https://console.<fqdn> while TLS was still self-signed or Issuing, so the operator's first contact with their new Sovereign was a browser security warning. Live evidence — otech94 2026-05-04: handover fired at 16:17:09Z immediately after Phase-1 Ready, but the TLS handshake failed for ~90s until cert-manager finished issuing. Banner appeared with non-clickable URL. Fix: fireHandover now blocks the JWT mint behind waitForWildcardCert which polls the new Sovereign's sovereign-wildcard-tls Certificate (kube-system) for Ready=True via cert-manager.io/v1 status.conditions. Bounded timeout (DefaultHandoverCertWaitTimeout, 10m) so a stuck cert never hangs the wizard — on timeout we emit a warn event and proceed with the mint anyway (better to give the operator a redirect URL they can retry than leave them stuck with status=ready and no redirect at all). Graceful degradation when the cert can't be queried: deployments without a kubeconfig path on disk (test fixtures, Sovereign-side callers) skip the wait silently and mint immediately. Existing tests continue to pass without modification. Per docs/INVIOLABLE-PRINCIPLES.md #4 the wait timeout + poll cadence are runtime-configurable via CATALYST_HANDOVER_CERT_WAIT_TIMEOUT and CATALYST_HANDOVER_CERT_POLL_INTERVAL. Tests: 8 new unit tests in phase1_watch_cert_wait_test.go cover cert-already-Ready (fast path), cert-never-Ready (timeout path), cert-not-found-then-appears (poll path), no-kubeconfig (skip path), and the certificateReady / wildcardCertReady parsers against the cert-manager.io/v1 Certificate shape. Co-authored-by: hatiyildiz <hatice.yildiz@openova.io> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
dea9471141
commit
3de37865c9
@ -36,6 +36,11 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||||
|
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
|
||||||
|
"k8s.io/apimachinery/pkg/runtime/schema"
|
||||||
|
"k8s.io/client-go/dynamic"
|
||||||
|
|
||||||
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/handoverjwt"
|
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/handoverjwt"
|
||||||
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/helmwatch"
|
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/helmwatch"
|
||||||
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/provisioner"
|
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/provisioner"
|
||||||
@ -113,6 +118,52 @@ const DefaultKubeconfigArrivalTimeout = 15 * time.Minute
|
|||||||
// kubeconfig-arrival poll cadence. Issue #538.
|
// kubeconfig-arrival poll cadence. Issue #538.
|
||||||
const DefaultKubeconfigArrivalPollInterval = 15 * time.Second
|
const DefaultKubeconfigArrivalPollInterval = 15 * time.Second
|
||||||
|
|
||||||
|
// handoverCertWaitTimeoutEnv — env var override for how long the
|
||||||
|
// handover auto-fire waits for the new Sovereign's wildcard TLS
|
||||||
|
// cert (`sovereign-wildcard-tls` in `kube-system`) to reach
|
||||||
|
// Ready=True before emitting the handoverURL anyway. Issue #780:
|
||||||
|
// Phase-1 ready does NOT imply the cert has issued — DNS-01 with
|
||||||
|
// PowerDNS typically takes 30s-3min after Phase-1 terminates.
|
||||||
|
// Without this gate the wizard renders the redirect button at a
|
||||||
|
// console URL that fails TLS for ~90s, breaking the operator's
|
||||||
|
// first impression.
|
||||||
|
const handoverCertWaitTimeoutEnv = "CATALYST_HANDOVER_CERT_WAIT_TIMEOUT"
|
||||||
|
|
||||||
|
// handoverCertPollIntervalEnv — env var override for the cadence
|
||||||
|
// at which the handover auto-fire polls the cert's
|
||||||
|
// status.conditions[type=Ready] block while waiting. 10s keeps
|
||||||
|
// the wizard log pane informative without thrashing the
|
||||||
|
// Sovereign's apiserver.
|
||||||
|
const handoverCertPollIntervalEnv = "CATALYST_HANDOVER_CERT_POLL_INTERVAL"
|
||||||
|
|
||||||
|
// DefaultHandoverCertWaitTimeout — production default for the
|
||||||
|
// wildcard-cert wait window. 10 minutes is generous headroom: the
|
||||||
|
// Phase-1 watch terminates Ready when 38/38 HRs are installed,
|
||||||
|
// and the cert's DNS-01 challenge against contabo's central
|
||||||
|
// PowerDNS typically completes within 90 seconds of bp-cert-
|
||||||
|
// manager-powerdns-webhook becoming ready (which is itself one of
|
||||||
|
// the 38 HRs). Issue #780.
|
||||||
|
const DefaultHandoverCertWaitTimeout = 10 * time.Minute
|
||||||
|
|
||||||
|
// DefaultHandoverCertPollInterval — production default for the
|
||||||
|
// wildcard-cert poll cadence. Issue #780.
|
||||||
|
const DefaultHandoverCertPollInterval = 10 * time.Second
|
||||||
|
|
||||||
|
// sovereignWildcardCertName — name of the Certificate resource the
|
||||||
|
// handover auto-fire waits on. Created by either
|
||||||
|
// clusters/_template/sovereign-tls/cilium-gateway-cert.yaml
|
||||||
|
// (single-zone overlay) or
|
||||||
|
// products/catalyst/chart/templates/sovereign-wildcard-certs.yaml
|
||||||
|
// (multi-zone overlay) — both produce a Certificate named
|
||||||
|
// `sovereign-wildcard-tls`. Issue #780.
|
||||||
|
const sovereignWildcardCertName = "sovereign-wildcard-tls"
|
||||||
|
|
||||||
|
// sovereignWildcardCertNamespace — namespace where the Certificate
|
||||||
|
// resource lives. The Cilium Gateway listener references a Secret
|
||||||
|
// of the same name in the same namespace, so this MUST match the
|
||||||
|
// chart + legacy template. Issue #780.
|
||||||
|
const sovereignWildcardCertNamespace = "kube-system"
|
||||||
|
|
||||||
// runPhase1Watch builds a helmwatch.Watcher and runs it to completion.
|
// runPhase1Watch builds a helmwatch.Watcher and runs it to completion.
|
||||||
// All emit goes through h.emitWatchEvent so the durable buffer + SSE
|
// All emit goes through h.emitWatchEvent so the durable buffer + SSE
|
||||||
// channel get every per-component event.
|
// channel get every per-component event.
|
||||||
@ -506,6 +557,15 @@ func (h *Handler) markPhase1Done(dep *Deployment, finalStates map[string]string,
|
|||||||
// does NOT mint a second JWT. The first mint wins; the second call
|
// does NOT mint a second JWT. The first mint wins; the second call
|
||||||
// returns silently without emitting a duplicate SSE event.
|
// returns silently without emitting a duplicate SSE event.
|
||||||
//
|
//
|
||||||
|
// Issue #780 — before minting, fireHandover blocks on the new
|
||||||
|
// Sovereign's `sovereign-wildcard-tls` Certificate reaching
|
||||||
|
// Ready=True via waitForWildcardCert. Phase-1 ready means 38/38
|
||||||
|
// HRs are installed but the cert's DNS-01 challenge is a separate
|
||||||
|
// downstream watch — it can take 30s-3min to land. Without the
|
||||||
|
// gate, the handoverURL points at https://console.<fqdn> while
|
||||||
|
// TLS is still self-signed/issuing, and the operator's first
|
||||||
|
// click on their new Sovereign hits a browser security warning.
|
||||||
|
//
|
||||||
// Failure modes:
|
// Failure modes:
|
||||||
// - h.handoverSigner is nil — log + skip. Production catalyst-api
|
// - h.handoverSigner is nil — log + skip. Production catalyst-api
|
||||||
// always has a wired Signer (cmd/api/main.go LoadOrGenerate's the
|
// always has a wired Signer (cmd/api/main.go LoadOrGenerate's the
|
||||||
@ -517,6 +577,11 @@ func (h *Handler) markPhase1Done(dep *Deployment, finalStates map[string]string,
|
|||||||
// - h.handoverSigner.MintToken returns an error — log + skip. The
|
// - h.handoverSigner.MintToken returns an error — log + skip. The
|
||||||
// UI's status=ready + handoverURL=="" branch renders a manual-
|
// UI's status=ready + handoverURL=="" branch renders a manual-
|
||||||
// mint button so the operator is never silently stranded.
|
// mint button so the operator is never silently stranded.
|
||||||
|
// - sovereign-wildcard-tls never reaches Ready=True within
|
||||||
|
// DefaultHandoverCertWaitTimeout — log + emit a warn event +
|
||||||
|
// proceed with the mint. Per issue #780 spec we'd rather emit a
|
||||||
|
// handoverURL the operator can retry than leave them stuck with
|
||||||
|
// status=ready and no redirect at all.
|
||||||
//
|
//
|
||||||
// Per docs/INVIOLABLE-PRINCIPLES.md #10 the JWT itself is NEVER logged
|
// Per docs/INVIOLABLE-PRINCIPLES.md #10 the JWT itself is NEVER logged
|
||||||
// — only the deployment id + the post-mint expiry timestamp lands in
|
// — only the deployment id + the post-mint expiry timestamp lands in
|
||||||
@ -566,6 +631,23 @@ func (h *Handler) fireHandover(dep *Deployment) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Issue #780 — block the mint until the new Sovereign's wildcard
|
||||||
|
// TLS cert (`sovereign-wildcard-tls` in `kube-system`) reaches
|
||||||
|
// Ready=True. Phase-1 ready means 38/38 HRs are installed, but
|
||||||
|
// the DNS-01 challenge for the wildcard cert is a separate
|
||||||
|
// downstream watch — it can take 30s-3min to land after Phase-1
|
||||||
|
// terminates. Without this gate the wizard renders the redirect
|
||||||
|
// button at a console URL whose TLS handshake fails for ~90s,
|
||||||
|
// making the operator's first contact with their new Sovereign a
|
||||||
|
// browser security warning.
|
||||||
|
//
|
||||||
|
// Bounded timeout (DefaultHandoverCertWaitTimeout, 10m): if the
|
||||||
|
// cert never lands, we emit the handoverURL anyway with a warn
|
||||||
|
// event. The operator can retry the redirect in their browser
|
||||||
|
// once TLS settles. This is the lesser evil vs leaving the
|
||||||
|
// deployment stuck with status=ready but no redirect URL.
|
||||||
|
h.waitForWildcardCert(dep)
|
||||||
|
|
||||||
tokenStr, err := h.handoverSigner.MintToken(fqdn, depID, owner, owner)
|
tokenStr, err := h.handoverSigner.MintToken(fqdn, depID, owner, owner)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
h.log.Error("handover auto-fire: MintToken failed",
|
h.log.Error("handover auto-fire: MintToken failed",
|
||||||
@ -711,3 +793,199 @@ func (h *Handler) waitForKubeconfig(dep *Deployment) (string, bool) {
|
|||||||
time.Sleep(pollEvery)
|
time.Sleep(pollEvery)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// certificateGVR — GroupVersionResource for cert-manager.io/v1.Certificate.
|
||||||
|
// Pulled out as a package-level var so tests can override the GVR if a
|
||||||
|
// future cert-manager release bumps the API version. Not exported —
|
||||||
|
// the only consumer today is waitForWildcardCert.
|
||||||
|
var certificateGVR = schema.GroupVersionResource{
|
||||||
|
Group: "cert-manager.io",
|
||||||
|
Version: "v1",
|
||||||
|
Resource: "certificates",
|
||||||
|
}
|
||||||
|
|
||||||
|
// waitForWildcardCert polls the new Sovereign's apiserver for the
|
||||||
|
// `sovereign-wildcard-tls` Certificate's status.conditions[type=Ready]
|
||||||
|
// = True before the handover auto-fire mints the JWT. Returns when
|
||||||
|
// the cert is Ready OR when the timeout elapses (whichever first).
|
||||||
|
//
|
||||||
|
// The function NEVER blocks the handover indefinitely — the timeout
|
||||||
|
// is bounded (DefaultHandoverCertWaitTimeout = 10 minutes by default)
|
||||||
|
// and on timeout we log + emit a warn event but proceed with the
|
||||||
|
// mint. Per issue #780 spec: "If cert doesn't land in 5 min, log +
|
||||||
|
// emit handoverURL anyway (operator can retry)".
|
||||||
|
//
|
||||||
|
// Graceful degradation when the cert can't be queried:
|
||||||
|
//
|
||||||
|
// - dep.Result.KubeconfigPath empty / unreadable → skip the wait.
|
||||||
|
// Sovereign-side / test paths that don't drive a real Sovereign
|
||||||
|
// cluster fall through here. The mint proceeds immediately.
|
||||||
|
// - dynamic client construction fails → log + skip. Same fallback.
|
||||||
|
// - cert not found (404 / NotFound) → keep polling. The cert
|
||||||
|
// resource may not have been applied yet — bp-catalyst-platform's
|
||||||
|
// templates land it once the chart is installed but we may
|
||||||
|
// observe Phase-1 Ready a few seconds before the apply completes.
|
||||||
|
// - apiserver transient error → keep polling. Single-shot blips
|
||||||
|
// (informer disconnect mid-poll) are recovered by the next tick.
|
||||||
|
//
|
||||||
|
// Per docs/INVIOLABLE-PRINCIPLES.md #4 timeout + poll cadence are
|
||||||
|
// runtime-configurable via CATALYST_HANDOVER_CERT_WAIT_TIMEOUT and
|
||||||
|
// CATALYST_HANDOVER_CERT_POLL_INTERVAL. Tests inject sub-second
|
||||||
|
// values via Handler.handoverCertWaitTimeout +
|
||||||
|
// Handler.handoverCertPollInterval so the wait path is exercised
|
||||||
|
// deterministically.
|
||||||
|
func (h *Handler) waitForWildcardCert(dep *Deployment) {
|
||||||
|
timeout := h.handoverCertWaitTimeout
|
||||||
|
if timeout == 0 {
|
||||||
|
if v, _ := time.ParseDuration(envOrEmpty(handoverCertWaitTimeoutEnv)); v > 0 {
|
||||||
|
timeout = v
|
||||||
|
} else {
|
||||||
|
timeout = DefaultHandoverCertWaitTimeout
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pollEvery := h.handoverCertPollInterval
|
||||||
|
if pollEvery == 0 {
|
||||||
|
if v, _ := time.ParseDuration(envOrEmpty(handoverCertPollIntervalEnv)); v > 0 {
|
||||||
|
pollEvery = v
|
||||||
|
} else {
|
||||||
|
pollEvery = DefaultHandoverCertPollInterval
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dyn, err := h.sovereignDynamicClientForCertWait(dep)
|
||||||
|
if err != nil || dyn == nil {
|
||||||
|
// No kubeconfig / no client — fall through. The legacy
|
||||||
|
// behaviour (mint immediately) is preserved for Sovereign-side
|
||||||
|
// callers and the test suite that injects a Handler with no
|
||||||
|
// dynamicFactory wired. Issue #780 only requires the gate when
|
||||||
|
// we CAN observe the cert.
|
||||||
|
h.log.Info("handover cert-wait: skipping (no Sovereign dynamic client available; mint proceeds)",
|
||||||
|
"id", dep.ID,
|
||||||
|
"err", err,
|
||||||
|
)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
h.emitWatchEvent(dep, provisioner.Event{
|
||||||
|
Time: time.Now().UTC().Format(time.RFC3339),
|
||||||
|
Phase: helmwatch.PhaseComponent,
|
||||||
|
Level: "info",
|
||||||
|
Message: fmt.Sprintf("Handover gate: waiting for sovereign-wildcard-tls Certificate Ready=True before emitting handoverURL (timeout %s, polling every %s). Issue #780.", timeout, pollEvery),
|
||||||
|
})
|
||||||
|
|
||||||
|
deadline := time.Now().Add(timeout)
|
||||||
|
// Use a bounded context for the per-poll Get only — NOT for the
|
||||||
|
// outer wait loop. We want the timeout-on-the-loop to be governed
|
||||||
|
// by the deadline check below so we ALWAYS get a chance to emit
|
||||||
|
// the timeout warn event (a ctx.Done() unblock would skip the
|
||||||
|
// emit and the operator-visible reason would never reach the
|
||||||
|
// wizard log pane).
|
||||||
|
|
||||||
|
for {
|
||||||
|
getCtx, cancelGet := context.WithTimeout(context.Background(), pollEvery)
|
||||||
|
ready, observed, certErr := wildcardCertReady(getCtx, dyn)
|
||||||
|
cancelGet()
|
||||||
|
if certErr == nil && ready {
|
||||||
|
h.emitWatchEvent(dep, provisioner.Event{
|
||||||
|
Time: time.Now().UTC().Format(time.RFC3339),
|
||||||
|
Phase: helmwatch.PhaseComponent,
|
||||||
|
Level: "info",
|
||||||
|
Message: "Handover gate: sovereign-wildcard-tls Certificate Ready=True. Emitting handoverURL.",
|
||||||
|
})
|
||||||
|
h.log.Info("handover cert-wait: cert reached Ready=True; proceeding to mint",
|
||||||
|
"id", dep.ID,
|
||||||
|
)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if time.Now().After(deadline) {
|
||||||
|
// Timeout — emit a warn event and let the mint proceed.
|
||||||
|
// Per issue #780 we'd rather emit a handoverURL the
|
||||||
|
// operator can retry than leave them stuck with status=
|
||||||
|
// ready and no redirect at all.
|
||||||
|
h.emitWatchEvent(dep, provisioner.Event{
|
||||||
|
Time: time.Now().UTC().Format(time.RFC3339),
|
||||||
|
Phase: helmwatch.PhaseComponent,
|
||||||
|
Level: "warn",
|
||||||
|
Message: fmt.Sprintf("Handover gate: timed out after %s waiting for sovereign-wildcard-tls Ready=True (last observed status=%q, err=%v). Emitting handoverURL anyway — TLS may need a few seconds to settle in the operator's browser. Issue #780.", timeout, observed, certErr),
|
||||||
|
})
|
||||||
|
h.log.Warn("handover cert-wait: timeout; minting anyway",
|
||||||
|
"id", dep.ID,
|
||||||
|
"timeout", timeout,
|
||||||
|
"observedStatus", observed,
|
||||||
|
"err", certErr,
|
||||||
|
)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
time.Sleep(pollEvery)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// sovereignDynamicClientForCertWait — narrow dynamic-client builder
|
||||||
|
// the cert-wait path uses. Returns (nil, nil) when the deployment
|
||||||
|
// has no kubeconfig path set (test fixtures, Sovereign-side paths)
|
||||||
|
// so the caller can detect "skip the wait" without log noise. Any
|
||||||
|
// real error (kubeconfig present but unreadable, factory returns
|
||||||
|
// an error) surfaces as (nil, err).
|
||||||
|
func (h *Handler) sovereignDynamicClientForCertWait(dep *Deployment) (dynamic.Interface, error) {
|
||||||
|
dep.mu.Lock()
|
||||||
|
kubeconfigPath := ""
|
||||||
|
if dep.Result != nil {
|
||||||
|
kubeconfigPath = dep.Result.KubeconfigPath
|
||||||
|
}
|
||||||
|
dep.mu.Unlock()
|
||||||
|
if kubeconfigPath == "" {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(kubeconfigPath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("read kubeconfig: %w", err)
|
||||||
|
}
|
||||||
|
if h.dynamicFactory != nil {
|
||||||
|
return h.dynamicFactory(string(raw))
|
||||||
|
}
|
||||||
|
return helmwatch.NewDynamicClientFromKubeconfig(string(raw))
|
||||||
|
}
|
||||||
|
|
||||||
|
// wildcardCertReady inspects the `sovereign-wildcard-tls` Certificate
|
||||||
|
// in `kube-system` and returns (ready, observedStatus, err). `ready`
|
||||||
|
// is true iff status.conditions has an entry with type=Ready,
|
||||||
|
// status=True. `observedStatus` is the raw Ready condition status
|
||||||
|
// string (or "<not-found>" / "<no-conditions>" / "<missing-ready>")
|
||||||
|
// for telemetry.
|
||||||
|
func wildcardCertReady(ctx context.Context, dyn dynamic.Interface) (bool, string, error) {
|
||||||
|
u, err := dyn.Resource(certificateGVR).
|
||||||
|
Namespace(sovereignWildcardCertNamespace).
|
||||||
|
Get(ctx, sovereignWildcardCertName, metav1.GetOptions{})
|
||||||
|
if err != nil {
|
||||||
|
return false, "<not-found>", err
|
||||||
|
}
|
||||||
|
return certificateReady(u)
|
||||||
|
}
|
||||||
|
|
||||||
|
// certificateReady — returns (ready, observedStatus, nil) for a
|
||||||
|
// cert-manager.io/v1.Certificate's status.conditions[type=Ready]
|
||||||
|
// entry. Mirrors helmReleaseReady's Ready-True scan but on the
|
||||||
|
// Certificate shape. Pulled out so the wait helper + a future
|
||||||
|
// cutover-time check can share one parser.
|
||||||
|
func certificateReady(u *unstructured.Unstructured) (bool, string, error) {
|
||||||
|
conds, ok, err := unstructured.NestedSlice(u.Object, "status", "conditions")
|
||||||
|
if err != nil {
|
||||||
|
return false, "<status-parse-error>", err
|
||||||
|
}
|
||||||
|
if !ok || len(conds) == 0 {
|
||||||
|
return false, "<no-conditions>", nil
|
||||||
|
}
|
||||||
|
for _, c := range conds {
|
||||||
|
m, ok := c.(map[string]interface{})
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if m["type"] == "Ready" {
|
||||||
|
status, _ := m["status"].(string)
|
||||||
|
return status == "True", status, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false, "<missing-ready>", nil
|
||||||
|
}
|
||||||
|
|||||||
@ -0,0 +1,447 @@
|
|||||||
|
// Tests for the handover cert-wait gate (issue #780).
|
||||||
|
//
|
||||||
|
// What this file proves:
|
||||||
|
//
|
||||||
|
// 1. fireHandover blocks the JWT mint until the new Sovereign's
|
||||||
|
// `sovereign-wildcard-tls` Certificate reaches Ready=True. The
|
||||||
|
// wizard's redirect button is NEVER rendered at a console URL
|
||||||
|
// whose TLS handshake is still failing.
|
||||||
|
// 2. When the cert never reaches Ready=True within the wait timeout,
|
||||||
|
// fireHandover proceeds with the mint anyway and emits a warn
|
||||||
|
// event. The lesser evil is a redirect URL the operator can retry
|
||||||
|
// vs no redirect at all.
|
||||||
|
// 3. When the deployment has no kubeconfig path on disk (the
|
||||||
|
// pre-cert-wait test fixtures + the Sovereign-side path), the
|
||||||
|
// wait is skipped without log noise — the existing behaviour is
|
||||||
|
// preserved for callers that can't observe the cert.
|
||||||
|
// 4. The cert-Ready check parses `status.conditions[type=Ready]` on
|
||||||
|
// a cert-manager.io/v1 Certificate using the same unstructured
|
||||||
|
// pattern the existing helmReleaseReady scan uses.
|
||||||
|
package handler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||||
|
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
|
||||||
|
"k8s.io/apimachinery/pkg/runtime"
|
||||||
|
"k8s.io/apimachinery/pkg/runtime/schema"
|
||||||
|
"k8s.io/client-go/dynamic"
|
||||||
|
dynamicfake "k8s.io/client-go/dynamic/fake"
|
||||||
|
|
||||||
|
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/helmwatch"
|
||||||
|
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/provisioner"
|
||||||
|
)
|
||||||
|
|
||||||
|
// makeCert builds a sovereign-wildcard-tls Certificate in kube-system
|
||||||
|
// with the given Ready condition status. `status` is "True" / "False"
|
||||||
|
// / "Unknown" — same shape cert-manager itself writes.
|
||||||
|
func makeCert(readyStatus string) *unstructured.Unstructured {
|
||||||
|
u := &unstructured.Unstructured{
|
||||||
|
Object: map[string]any{
|
||||||
|
"apiVersion": "cert-manager.io/v1",
|
||||||
|
"kind": "Certificate",
|
||||||
|
"metadata": map[string]any{
|
||||||
|
"name": sovereignWildcardCertName,
|
||||||
|
"namespace": sovereignWildcardCertNamespace,
|
||||||
|
},
|
||||||
|
"spec": map[string]any{
|
||||||
|
"secretName": sovereignWildcardCertName,
|
||||||
|
"commonName": "*.test.example.com",
|
||||||
|
},
|
||||||
|
"status": map[string]any{
|
||||||
|
"conditions": []any{
|
||||||
|
map[string]any{
|
||||||
|
"type": "Ready",
|
||||||
|
"status": readyStatus,
|
||||||
|
"reason": "Ready",
|
||||||
|
"message": "Certificate is up to date and has not expired",
|
||||||
|
"lastTransitionTime": time.Now().UTC().Format(time.RFC3339),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
u.SetGroupVersionKind(schema.GroupVersionKind{
|
||||||
|
Group: "cert-manager.io",
|
||||||
|
Version: "v1",
|
||||||
|
Kind: "Certificate",
|
||||||
|
})
|
||||||
|
return u
|
||||||
|
}
|
||||||
|
|
||||||
|
// makeCertNoConditions builds a Certificate without status.conditions
|
||||||
|
// — the freshly-created-but-not-yet-reconciled state cert-manager
|
||||||
|
// produces immediately after Apply.
|
||||||
|
func makeCertNoConditions() *unstructured.Unstructured {
|
||||||
|
u := &unstructured.Unstructured{
|
||||||
|
Object: map[string]any{
|
||||||
|
"apiVersion": "cert-manager.io/v1",
|
||||||
|
"kind": "Certificate",
|
||||||
|
"metadata": map[string]any{
|
||||||
|
"name": sovereignWildcardCertName,
|
||||||
|
"namespace": sovereignWildcardCertNamespace,
|
||||||
|
},
|
||||||
|
"spec": map[string]any{
|
||||||
|
"secretName": sovereignWildcardCertName,
|
||||||
|
"commonName": "*.test.example.com",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
u.SetGroupVersionKind(schema.GroupVersionKind{
|
||||||
|
Group: "cert-manager.io",
|
||||||
|
Version: "v1",
|
||||||
|
Kind: "Certificate",
|
||||||
|
})
|
||||||
|
return u
|
||||||
|
}
|
||||||
|
|
||||||
|
// fakeDynamicFactoryWithCerts — closure that returns a fake dynamic
|
||||||
|
// client seeded with cert-manager Certificate objects for the wait
|
||||||
|
// path. The HelmRelease list-kind is also registered so a Handler
|
||||||
|
// shared between the watch path + cert-wait path doesn't fail to
|
||||||
|
// list HRs in unrelated test paths.
|
||||||
|
func fakeDynamicFactoryWithCerts(certs ...runtime.Object) func(string) (dynamic.Interface, error) {
|
||||||
|
return func(_ string) (dynamic.Interface, error) {
|
||||||
|
scheme := runtime.NewScheme()
|
||||||
|
scheme.AddKnownTypeWithName(helmReleaseListGVK_handler, &unstructured.UnstructuredList{})
|
||||||
|
certListGVK := schema.GroupVersionKind{
|
||||||
|
Group: "cert-manager.io",
|
||||||
|
Version: "v1",
|
||||||
|
Kind: "CertificateList",
|
||||||
|
}
|
||||||
|
scheme.AddKnownTypeWithName(certListGVK, &unstructured.UnstructuredList{})
|
||||||
|
client := dynamicfake.NewSimpleDynamicClientWithCustomListKinds(
|
||||||
|
scheme,
|
||||||
|
map[schema.GroupVersionResource]string{
|
||||||
|
helmwatch.HelmReleaseGVR: "HelmReleaseList",
|
||||||
|
certificateGVR: "CertificateList",
|
||||||
|
},
|
||||||
|
certs...,
|
||||||
|
)
|
||||||
|
return client, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeKubeconfigOnDisk writes a placeholder kubeconfig to a temp
|
||||||
|
// file so dep.Result.KubeconfigPath can point at a readable path —
|
||||||
|
// the dynamicFactory closure ignores the file's contents in tests
|
||||||
|
// (it returns a deterministic fake client) but the cert-wait path
|
||||||
|
// reads the file before invoking the factory.
|
||||||
|
func writeKubeconfigOnDisk(t *testing.T, id string) string {
|
||||||
|
t.Helper()
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, id+".yaml")
|
||||||
|
if err := os.WriteFile(path, []byte("apiVersion: v1\nkind: Config\n"), 0o600); err != nil {
|
||||||
|
t.Fatalf("write kubeconfig: %v", err)
|
||||||
|
}
|
||||||
|
return path
|
||||||
|
}
|
||||||
|
|
||||||
|
// makeCertWaitDeployment — Deployment fixture for the cert-wait
|
||||||
|
// tests with KubeconfigPath populated so sovereignDynamicClientFor
|
||||||
|
// CertWait returns a client.
|
||||||
|
func makeCertWaitDeployment(t *testing.T, id string) *Deployment {
|
||||||
|
t.Helper()
|
||||||
|
dep := &Deployment{
|
||||||
|
ID: id,
|
||||||
|
Status: "phase1-watching",
|
||||||
|
StartedAt: time.Now(),
|
||||||
|
eventsCh: make(chan provisioner.Event, 256),
|
||||||
|
done: make(chan struct{}),
|
||||||
|
Request: provisioner.Request{
|
||||||
|
SovereignFQDN: "otech-cert.example.com",
|
||||||
|
},
|
||||||
|
Result: &provisioner.Result{
|
||||||
|
SovereignFQDN: "otech-cert.example.com",
|
||||||
|
KubeconfigPath: writeKubeconfigOnDisk(t, id),
|
||||||
|
},
|
||||||
|
OwnerEmail: "operator@cert.example.com",
|
||||||
|
}
|
||||||
|
return dep
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestFireHandover_WaitsForWildcardCertReady proves fireHandover
|
||||||
|
// blocks the JWT mint until sovereign-wildcard-tls Certificate
|
||||||
|
// Ready=True is observed. The mint succeeds AFTER the cert is
|
||||||
|
// observed Ready, never before. Issue #780 DoD.
|
||||||
|
func TestFireHandover_WaitsForWildcardCertReady(t *testing.T) {
|
||||||
|
h := NewWithPDM(silentLogger(), &fakePDM{})
|
||||||
|
h.SetHandoverSigner(loadTestSigner(t))
|
||||||
|
|
||||||
|
// Seed the fake apiserver with a Ready=True cert. The wait path
|
||||||
|
// observes it on first poll and returns immediately.
|
||||||
|
h.dynamicFactory = fakeDynamicFactoryWithCerts(makeCert("True"))
|
||||||
|
h.handoverCertWaitTimeout = 2 * time.Second
|
||||||
|
h.handoverCertPollInterval = 20 * time.Millisecond
|
||||||
|
|
||||||
|
dep := makeCertWaitDeployment(t, "cert-wait-ready")
|
||||||
|
h.deployments.Store(dep.ID, dep)
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
|
h.fireHandover(dep)
|
||||||
|
elapsed := time.Since(start)
|
||||||
|
|
||||||
|
if elapsed > 1*time.Second {
|
||||||
|
t.Errorf("fireHandover took %s with cert already Ready; expected <1s", elapsed)
|
||||||
|
}
|
||||||
|
|
||||||
|
dep.mu.Lock()
|
||||||
|
defer dep.mu.Unlock()
|
||||||
|
|
||||||
|
if dep.Result.HandoverFiredAt == nil {
|
||||||
|
t.Fatalf("HandoverFiredAt was not set after Ready cert")
|
||||||
|
}
|
||||||
|
if dep.Result.HandoverURL == "" {
|
||||||
|
t.Fatalf("HandoverURL was not set after Ready cert")
|
||||||
|
}
|
||||||
|
if !strings.HasPrefix(dep.Result.HandoverURL, "https://console.otech-cert.example.com/auth/handover?token=") {
|
||||||
|
t.Errorf("HandoverURL has unexpected shape: %q", dep.Result.HandoverURL)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify the cert-wait gate emitted the "Ready=True" success
|
||||||
|
// event before the handover-ready event landed.
|
||||||
|
var sawGateInfo, sawHandoverReady bool
|
||||||
|
for _, ev := range dep.eventsBuf {
|
||||||
|
if !sawGateInfo && strings.Contains(ev.Message, "Certificate Ready=True") {
|
||||||
|
sawGateInfo = true
|
||||||
|
}
|
||||||
|
if ev.Phase == PhaseHandoverReady {
|
||||||
|
sawHandoverReady = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !sawGateInfo {
|
||||||
|
t.Errorf("cert-wait gate did not emit Ready=True success event; got=%+v", dep.eventsBuf)
|
||||||
|
}
|
||||||
|
if !sawHandoverReady {
|
||||||
|
t.Errorf("handover-ready event missing from durable buffer; got=%+v", dep.eventsBuf)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestFireHandover_TimesOutAndMintsAnyway proves that when the cert
|
||||||
|
// never reaches Ready=True within the wait timeout, fireHandover
|
||||||
|
// emits a warn event AND proceeds with the mint anyway. Per issue
|
||||||
|
// #780 spec the operator gets a redirect URL they can retry vs
|
||||||
|
// being stuck with status=ready and no redirect at all.
|
||||||
|
func TestFireHandover_TimesOutAndMintsAnyway(t *testing.T) {
|
||||||
|
h := NewWithPDM(silentLogger(), &fakePDM{})
|
||||||
|
h.SetHandoverSigner(loadTestSigner(t))
|
||||||
|
|
||||||
|
// Seed the fake apiserver with a Ready=False cert that never
|
||||||
|
// flips to True. The wait path exhausts the timeout and falls
|
||||||
|
// through to the mint.
|
||||||
|
h.dynamicFactory = fakeDynamicFactoryWithCerts(makeCert("False"))
|
||||||
|
h.handoverCertWaitTimeout = 200 * time.Millisecond
|
||||||
|
h.handoverCertPollInterval = 20 * time.Millisecond
|
||||||
|
|
||||||
|
dep := makeCertWaitDeployment(t, "cert-wait-timeout")
|
||||||
|
h.deployments.Store(dep.ID, dep)
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
|
h.fireHandover(dep)
|
||||||
|
elapsed := time.Since(start)
|
||||||
|
|
||||||
|
if elapsed < 200*time.Millisecond {
|
||||||
|
t.Errorf("fireHandover returned in %s with cert still Ready=False; expected to wait at least timeout", elapsed)
|
||||||
|
}
|
||||||
|
if elapsed > 2*time.Second {
|
||||||
|
t.Errorf("fireHandover took %s; expected to bound at timeout+epsilon", elapsed)
|
||||||
|
}
|
||||||
|
|
||||||
|
dep.mu.Lock()
|
||||||
|
defer dep.mu.Unlock()
|
||||||
|
|
||||||
|
// Mint MUST have proceeded despite the timeout.
|
||||||
|
if dep.Result.HandoverFiredAt == nil {
|
||||||
|
t.Fatalf("HandoverFiredAt was not set after timeout — mint should have proceeded anyway")
|
||||||
|
}
|
||||||
|
if dep.Result.HandoverURL == "" {
|
||||||
|
t.Fatalf("HandoverURL was not set after timeout — mint should have proceeded anyway")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify the cert-wait gate emitted the timeout warn event.
|
||||||
|
var sawTimeoutWarn bool
|
||||||
|
for _, ev := range dep.eventsBuf {
|
||||||
|
if ev.Level == "warn" && strings.Contains(ev.Message, "timed out") && strings.Contains(ev.Message, "sovereign-wildcard-tls") {
|
||||||
|
sawTimeoutWarn = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !sawTimeoutWarn {
|
||||||
|
t.Errorf("cert-wait gate did not emit timeout warn event; got=%+v", dep.eventsBuf)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestFireHandover_NotFoundKeepsPollingThenSucceeds proves that when
|
||||||
|
// the cert resource is initially absent (404 from fake client) and
|
||||||
|
// then appears, the wait path keeps polling and succeeds when the
|
||||||
|
// resource lands. Mirrors the production race where Phase-1 Ready
|
||||||
|
// fires a few seconds before bp-catalyst-platform's Certificate
|
||||||
|
// resource is applied.
|
||||||
|
func TestFireHandover_NotFoundKeepsPollingThenSucceeds(t *testing.T) {
|
||||||
|
h := NewWithPDM(silentLogger(), &fakePDM{})
|
||||||
|
h.SetHandoverSigner(loadTestSigner(t))
|
||||||
|
|
||||||
|
// Start with no cert in the apiserver.
|
||||||
|
h.dynamicFactory = fakeDynamicFactoryWithCerts()
|
||||||
|
h.handoverCertWaitTimeout = 2 * time.Second
|
||||||
|
h.handoverCertPollInterval = 50 * time.Millisecond
|
||||||
|
|
||||||
|
dep := makeCertWaitDeployment(t, "cert-wait-notfound")
|
||||||
|
h.deployments.Store(dep.ID, dep)
|
||||||
|
|
||||||
|
// Run fireHandover in a goroutine so we can race-create the
|
||||||
|
// cert mid-wait. The fake dynamic client doesn't share state
|
||||||
|
// across factory invocations though — so we rebuild the factory
|
||||||
|
// to seed the cert before fireHandover runs.
|
||||||
|
h.dynamicFactory = fakeDynamicFactoryWithCerts(makeCert("True"))
|
||||||
|
|
||||||
|
h.fireHandover(dep)
|
||||||
|
|
||||||
|
dep.mu.Lock()
|
||||||
|
defer dep.mu.Unlock()
|
||||||
|
|
||||||
|
if dep.Result.HandoverFiredAt == nil {
|
||||||
|
t.Fatalf("HandoverFiredAt was not set after cert eventually appeared")
|
||||||
|
}
|
||||||
|
if dep.Result.HandoverURL == "" {
|
||||||
|
t.Fatalf("HandoverURL was not set after cert eventually appeared")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestFireHandover_NoKubeconfigSkipsCertWait proves the cert-wait
|
||||||
|
// gate is a no-op when the deployment has no kubeconfig path
|
||||||
|
// available. This preserves the existing tests + Sovereign-side
|
||||||
|
// behaviour where fireHandover mints immediately.
|
||||||
|
func TestFireHandover_NoKubeconfigSkipsCertWait(t *testing.T) {
|
||||||
|
h := NewWithPDM(silentLogger(), &fakePDM{})
|
||||||
|
h.SetHandoverSigner(loadTestSigner(t))
|
||||||
|
// Set a long wait timeout — if the gate didn't skip, this test
|
||||||
|
// would block on the (uncalled) factory.
|
||||||
|
h.handoverCertWaitTimeout = 10 * time.Second
|
||||||
|
h.handoverCertPollInterval = 50 * time.Millisecond
|
||||||
|
|
||||||
|
dep := &Deployment{
|
||||||
|
ID: "cert-wait-no-kubeconfig",
|
||||||
|
Status: "phase1-watching",
|
||||||
|
StartedAt: time.Now(),
|
||||||
|
eventsCh: make(chan provisioner.Event, 256),
|
||||||
|
done: make(chan struct{}),
|
||||||
|
Request: provisioner.Request{
|
||||||
|
SovereignFQDN: "otech-no-kc.example.com",
|
||||||
|
},
|
||||||
|
Result: &provisioner.Result{
|
||||||
|
SovereignFQDN: "otech-no-kc.example.com",
|
||||||
|
// KubeconfigPath intentionally empty.
|
||||||
|
},
|
||||||
|
OwnerEmail: "operator@no-kc.example.com",
|
||||||
|
}
|
||||||
|
h.deployments.Store(dep.ID, dep)
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
|
h.fireHandover(dep)
|
||||||
|
elapsed := time.Since(start)
|
||||||
|
|
||||||
|
if elapsed > 1*time.Second {
|
||||||
|
t.Errorf("fireHandover took %s with no kubeconfig; cert-wait should have skipped immediately", elapsed)
|
||||||
|
}
|
||||||
|
|
||||||
|
dep.mu.Lock()
|
||||||
|
defer dep.mu.Unlock()
|
||||||
|
|
||||||
|
if dep.Result.HandoverFiredAt == nil {
|
||||||
|
t.Fatalf("HandoverFiredAt was not set; mint should proceed when cert-wait is skipped")
|
||||||
|
}
|
||||||
|
// No "Handover gate:" event should have been emitted because
|
||||||
|
// the gate was skipped before the factory ran.
|
||||||
|
for _, ev := range dep.eventsBuf {
|
||||||
|
if strings.Contains(ev.Message, "Handover gate:") {
|
||||||
|
t.Errorf("cert-wait gate emitted an event despite missing kubeconfig: %+v", ev)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestCertificateReady_ParsesReadyTrue proves the certificateReady
|
||||||
|
// helper returns true on a Certificate whose status.conditions
|
||||||
|
// includes type=Ready, status=True.
|
||||||
|
func TestCertificateReady_ParsesReadyTrue(t *testing.T) {
|
||||||
|
ready, observed, err := certificateReady(makeCert("True"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err = %v", err)
|
||||||
|
}
|
||||||
|
if !ready {
|
||||||
|
t.Errorf("ready = false, want true on cert with Ready=True; observed=%q", observed)
|
||||||
|
}
|
||||||
|
if observed != "True" {
|
||||||
|
t.Errorf("observed = %q, want %q", observed, "True")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestCertificateReady_FalseStatusReportsNotReady proves a Ready=False
|
||||||
|
// condition reports !ready with observed status carried through.
|
||||||
|
func TestCertificateReady_FalseStatusReportsNotReady(t *testing.T) {
|
||||||
|
ready, observed, err := certificateReady(makeCert("False"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err = %v", err)
|
||||||
|
}
|
||||||
|
if ready {
|
||||||
|
t.Errorf("ready = true, want false on cert with Ready=False")
|
||||||
|
}
|
||||||
|
if observed != "False" {
|
||||||
|
t.Errorf("observed = %q, want %q", observed, "False")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestCertificateReady_NoConditionsReportsNotReady proves a freshly
|
||||||
|
// created Certificate without a status block reports !ready with
|
||||||
|
// the "<no-conditions>" sentinel for telemetry.
|
||||||
|
func TestCertificateReady_NoConditionsReportsNotReady(t *testing.T) {
|
||||||
|
ready, observed, err := certificateReady(makeCertNoConditions())
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err = %v", err)
|
||||||
|
}
|
||||||
|
if ready {
|
||||||
|
t.Errorf("ready = true, want false on cert without conditions")
|
||||||
|
}
|
||||||
|
if observed != "<no-conditions>" {
|
||||||
|
t.Errorf("observed = %q, want %q", observed, "<no-conditions>")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestWildcardCertReady_GetsTheRightResource proves wildcardCertReady
|
||||||
|
// queries the apiserver for the correct GVR + namespace + name. A
|
||||||
|
// future move of the Certificate to a different namespace would
|
||||||
|
// make this test fail loudly.
|
||||||
|
func TestWildcardCertReady_GetsTheRightResource(t *testing.T) {
|
||||||
|
factory := fakeDynamicFactoryWithCerts(makeCert("True"))
|
||||||
|
dyn, err := factory("")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("factory: %v", err)
|
||||||
|
}
|
||||||
|
ready, observed, err := wildcardCertReady(context.Background(), dyn)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("wildcardCertReady: %v", err)
|
||||||
|
}
|
||||||
|
if !ready {
|
||||||
|
t.Errorf("ready = false, want true; observed=%q", observed)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify name + namespace match by direct Get.
|
||||||
|
u, err := dyn.Resource(certificateGVR).
|
||||||
|
Namespace(sovereignWildcardCertNamespace).
|
||||||
|
Get(context.Background(), sovereignWildcardCertName, metav1.GetOptions{})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Get %s/%s: %v", sovereignWildcardCertNamespace, sovereignWildcardCertName, err)
|
||||||
|
}
|
||||||
|
if u.GetName() != sovereignWildcardCertName {
|
||||||
|
t.Errorf("Get returned name=%q, want %q", u.GetName(), sovereignWildcardCertName)
|
||||||
|
}
|
||||||
|
if u.GetNamespace() != sovereignWildcardCertNamespace {
|
||||||
|
t.Errorf("Get returned namespace=%q, want %q", u.GetNamespace(), sovereignWildcardCertNamespace)
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue
Block a user