# openova/clusters/_template/bootstrap-kit/13-bp-catalyst-platform.yaml
# 2026-05-05 23:07:04 +04:00
#
# 309 lines
# 17 KiB
# YAML

# bp-catalyst-platform — Catalyst Blueprint #13 of 13. The umbrella
# Blueprint that brings up the Catalyst control plane: console, marketplace,
# admin, catalog-svc, projector, provisioning, environment-controller,
# blueprint-controller, billing.
#
# Per docs/ARCHITECTURE.md §11 (Catalyst-on-Catalyst): once this is Ready,
# the Sovereign is fully self-sufficient — sovereign-admin can log into
# console.${SOVEREIGN_FQDN} and proceed with Phase 2 day-1 setup.
#
# Wrapper chart: products/catalyst/chart/
---
# Namespace the catalyst-platform release is installed into (it is the
# HelmRelease's targetNamespace), labelled with the owning Sovereign's FQDN.
apiVersion: v1
kind: Namespace
metadata:
  name: catalyst-system
  labels:
    # Quoted so the substituted value is always read as a string label value.
    catalyst.openova.io/sovereign: "${SOVEREIGN_FQDN}"
---
# OCI Helm repository serving the bp-catalyst-platform chart. The HelmRelease
# of the same name points at it through chart.spec.sourceRef.
#
# apiVersion: source.toolkit.fluxcd.io/v1 is the GA HelmRepository API. It is
# served by every Flux release that also serves helm.toolkit.fluxcd.io/v2
# (the HelmRelease API used in this file), so the deprecated v1beta2 is not
# needed here.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: bp-catalyst-platform
  namespace: flux-system
spec:
  type: oci
  interval: 15m
  url: oci://ghcr.io/openova-io
  # Registry pull credentials; the Secret is looked up in this object's
  # namespace (flux-system).
  secretRef:
    name: ghcr-pull
---
# Flux HelmRelease for the Catalyst umbrella chart. Lives in flux-system,
# installs release `catalyst-platform` into the catalyst-system namespace.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: bp-catalyst-platform
  namespace: flux-system
spec:
  interval: 15m
  releaseName: catalyst-platform
  targetNamespace: catalyst-system
  # Entries carry no namespace, so they refer to HelmReleases in this
  # object's own namespace (flux-system).
  dependsOn:
    - name: bp-gitea
    # bp-gateway-api (issue #503): umbrella chart ships catalyst-ui +
    # catalyst-api HTTPRoute templates; gateway.networking.k8s.io/v1
    # CRDs must be registered first.
    - name: bp-gateway-api
    # bp-keycloak + bp-cnpg (issue #512): the catalyst-platform umbrella
    # post-install Jobs bootstrap OIDC clients in Keycloak and seed
    # PostgreSQL schemas for catalog-svc / projector / billing /
    # provisioning. Both Keycloak and cnpg take 5+ minutes to reach Ready
    # on a fresh Sovereign — without an explicit dep, the umbrella's
    # hook starts before they're warm and times out at 15m.
    # Phase-8a-preflight otech16 (2026-05-02): adding bp-keycloak +
    # bp-cnpg here makes Flux wait for both Ready=True before starting
    # the umbrella install, eliminating the race.
    - name: bp-keycloak
    - name: bp-cnpg
  chart:
    spec:
      chart: bp-catalyst-platform
      # 1.4.0 (issue #827): adds per-zone wildcard Certificate template.
      #   When `parentZones` is populated the chart renders one
      #   cert-manager.io/v1.Certificate per zone in kube-system; the
      #   Cilium Gateway listeners reference the per-zone Secrets. When
      #   `parentZones` is empty (legacy single-zone Sovereign) the chart
      #   falls back to a single Certificate covering `*.<sovereignFQDN>`
      #   so existing provisioning paths keep working.
      # 1.4.1 (PR #839): RBAC dual-mode render fix (Helm + Kustomize).
      # 1.4.2 (PR #841): POWERDNS env literal (no envsubst-mid-render).
      # 1.4.3 (issue #859): auto-provision sme-pg CNPG Cluster +
      #   sme-secrets when ingress.marketplace.enabled=true so SME
      #   services land Ready on a fresh Sovereign without hand-rolled
      #   SealedSecrets. Catalyst-Zero (contabo) keeps its pre-existing
      #   clusters/contabo-mkt/apps/sme/data/* manifests — those are
      #   outside templates/kustomization.yaml's resource list so the
      #   contabo Kustomize-mode build is unaffected.
      # 1.4.4 (issue #861): deploy FerretDB in `sme` ns + cross-ns
      #   CiliumNetworkPolicy from sme → valkey. Unblocks the 4 SME
      #   services (catalog, tenant, domain, provisioning) that pin to
      #   ferretdb.sme.svc.cluster.local for the MongoDB wire and the 2
      #   services (auth, gateway) that pin to valkey for session/state.
      #   cnpg-cluster.yaml extended to bootstrap sme_documents (FerretDB
      #   backing DB) alongside sme_billing.
      # 1.4.5 (issue #863): mirror bp-valkey's auto-generated auth
      #   password from `valkey/valkey` Secret into `sme/sme-valkey-auth`
      #   via Helm lookup, and wire VALKEY_PASSWORD into auth + gateway
      #   Deployments. Clears the NOAUTH HELLO crashloop that started
      #   appearing after 1.4.4 made cross-ns Valkey reachable.
      # 1.4.6 (issue #863 follow-up): rebuild chart artifact to bundle
      #   the rebuilt services-auth + services-gateway image (SHA fa4395f)
      #   that contains the ConnectValkeyWithAuth Go change. 1.4.5 shipped
      #   with the OLD image SHA baked in due to a race between the
      #   blueprint-release pipeline and the services-build deploy step.
      # 1.4.7 (issue #866): mirror the gitea-admin password into
      #   `sme/provisioning-github-token` so the last 1/13 SME pod
      #   (provisioning) reaches Running 1/1 on a fresh Sovereign,
      #   completing the SME stack 12/13 → 13/13. Same lookup-and-mirror
      #   pattern as valkey-cross-ns-secret.yaml (#863).
      # 1.4.8 (issue #868): fix marketplace UI PIN-signin — /api/*
      #   HTTPRoute now backendRefs sme/gateway:8080 (cross-namespace,
      #   authorised by ReferenceGrant). The previous catalyst-system/
      #   marketplace-api Service had zero backing Pods, so every signin
      #   POST 503'd at the gateway. Pairs with services-auth route alias
      #   /auth/send-pin → SendMagicLink (and /auth/verify-pin →
      #   VerifyMagicLink) so the UI's PIN-naming reaches the existing
      #   backend handler.
      # 1.4.13 (issue #882): NEW templates/sme-services/sme-tenants-
      #   kustomization.yaml renders a Flux Kustomization in flux-system
      #   that watches ./clusters/<sov-fqdn>/sme-tenants — the path the
      #   catalyst-api SME-tenant orchestrator (sme_tenant_gitops.go)
      #   commits per-tenant overlays to. Without this, POST
      #   /api/v1/sme/tenants reached state=done optimistically but no
      #   K8s resources materialised because nothing reconciled the
      #   orchestrator's write target. Gated on
      #   ingress.marketplace.enabled — non-marketplace Sovereigns don't
      #   run the SME tenant pipeline.
      # 1.4.14 (issue #879 follow-up): chart-version-only republish to
      #   bake catalyst-api image SHA 7bfd6df (the #879 fix commit) into
      #   values.yaml. 1.4.13 OCI bytes still reference the OLD image SHA
      #   because the deploy-bot updated values.yaml AFTER the chart was
      #   published. Same deploy-step race documented in 1.4.6 / 1.4.9 /
      #   1.4.12 changelog.
      # 1.4.15 (issue #887): auto-provision marketplace-api-secrets
      #   Secret on Sovereign install. templates/marketplace-api/
      #   deployment.yaml referenced a secretKeyRef on
      #   `marketplace-api-secrets` but the chart never rendered the
      #   Secret — caught live on otech103, marketplace-api in
      #   CreateContainerConfigError. Fix mirrors sme-secrets/
      #   valkey-cross-ns-secret/provisioning-github-token Helm-lookup
      #   persistence pattern. helm.sh/resource-policy: keep.
      # 1.4.16 (#893/#889 follow-up): chart-version-only republish to
      #   bake catalyst-api image SHA 727fb2f (containing the parent-
      #   kustomization.yaml index + helmrepositories.yaml emit + correct
      #   per-blueprint sourceRef.name in tenant overlay templates) into
      #   values.yaml. Without this bump the OCI artifact still references
      #   the old image and the Sovereign's tenant orchestrator emits
      #   tenant overlays with stale openova-blueprints sourceRef.
      # 1.4.17 (issue #901): unblock Sovereign Console login on every
      #   fresh provision. 3-bug chain:
      #   1. NEW templates/catalyst-openova-kc-credentials-secret.yaml
      #      auto-mirrors the canonical KC SA Secret (`keycloak/
      #      catalyst-kc-sa-credentials`) into catalyst-system as
      #      `catalyst-openova-kc-credentials` with the keys
      #      api-deployment.yaml's PIN-auth env block expects. Gated on
      #      `lookup "v1" "Secret" "keycloak" "catalyst-kc-sa-credentials"`
      #      returning non-nil — renders only on Sovereign, skips on
      #      contabo (which has its own hand-rolled Secret). Same Helm-
      #      `lookup` persistence + `helm.sh/resource-policy: keep`
      #      pattern as templates/marketplace-api/secret.yaml (#887).
      #   2. SMTP host/port/from defaults flow through .Values.sovereign.
      #      smtp.* (mail.openova.io:587 / noreply@openova.io). SMTP
      #      user/pass mirrored from `catalyst-system/sovereign-smtp-
      #      credentials` (#883) when present.
      #   3. CATALYST_POST_AUTH_REDIRECT default flips from
      #      /sovereign/wizard (mothership-only) to /sovereign/components
      #      (post-handover Sovereign homepage). Per-Sovereign overlays
      #      override via catalystApi.env additional-env patch.
      # 1.4.18 (issue #910): NEW templates/sme-services/sme-namespace.yaml
      #   creates the `sme` namespace on Sovereigns where the marketplace
      #   is enabled. Without this, chart 1.4.17 install failed 23 times
      #   with `failed to create resource: namespaces "sme" not found` on
      #   every fresh franchised Sovereign with marketplace.enabled=true —
      #   caught live on otech105 (2026-05-05). Same dual-mode contract as
      #   the rest of templates/sme-services/* (gated on
      #   ingress.marketplace.enabled, excluded from kustomization.yaml's
      #   resources: list).
      # 1.4.19 (issue #910 — Bugs 2 + 3): unblock Sovereign Console PIN-
      #   login on a freshly franchised cluster.
      #   Bug 2: CATALYST_SESSION_COOKIE_DOMAIN literal flips from
      #     `console.openova.io` to `""` (empty). On a Sovereign the
      #     request host is console.<sov-fqdn>, so the previous hardcoded
      #     value made the browser reject Set-Cookie (RFC 6265 §5.3 step 6
      #     Domain mismatch) and every /api/* request landed without a
      #     session, redirecting to /login forever. Empty value contract
      #     (Domain attribute omitted → cookie binds to request host) is
      #     correct on BOTH Sovereign (console.<sov-fqdn>) AND contabo
      #     (console.openova.io — wizard + magic-link served from the
      #     same host). Per-Sovereign overlays MAY override via
      #     catalystApi.env additional-env patch for unusual topologies.
      #
      #   Bug 3: catalyst-openova-kc-credentials-secret.yaml's smtp-
      #     user/smtp-pass lookup precedence inverts: SOURCE
      #     (sovereign-smtp-credentials, seeded by A5's provisioner #883)
      #     wins over the persisted target. Pre-1.4.19 target-wins meant
      #     first-install rendered empty SMTP creds, persisted them, and
      #     NEVER picked up A5's seeded bytes — POST /api/v1/auth/pin/
      #     issue 502'd `email-send-failed` for the life of the cluster.
      #     Source-wins makes every Flux 1m reconcile re-read the source.
      #     KC fields keep "existing target wins" because bp-keycloak
      #     auto-rotates the client-secret on every Helm upgrade and we
      #     want that rotation to require explicit operator action
      #     (delete the target) rather than auto-roll the catalyst-api
      #     Pod.
      # 1.4.20 (#924): Phase-2 SMTP cutover. SOURCE-wins precedence
      #   extended to (a) non-secret fields smtp-host/smtp-port/smtp-from
      #   so the per-Sovereign Stalwart relay (`mail.<sovereignFQDN>`)
      #   takes over from the mothership default (`mail.openova.io`) on
      #   the next reconcile after slot 95 (bp-stalwart-sovereign) lands,
      #   and (b) canonical key shape `smtp-user`/`smtp-pass` in addition
      #   to the legacy `user`/`password` source key shape — the new
      #   chart writes both shapes, this chart reads either.
      # 1.4.22 (#915 SME blockers): six chart + orchestrator fixes
      #   unblocking alice signup gates 2-6 on franchised Sovereigns —
      #   issues #934 (auth SMTP empty), #940 (provisioning placeholder
      #   GITHUB_TOKEN + hardcoded upstream github.com), #941 (catalog
      #   migrateAppDeployable missing openclaw + stalwart-mail), #942
      #   (REDPANDA_BROKERS hardcoded to talentmesh — switched to NATS
      #   JetStream on Sovereigns per ADR-0001), #943 (bp-newapi
      #   silently skipped Deployment — paired bp-newapi 1.4.0 auto-
      #   provisions CNPG cluster + credentials Secret), #944 (CRITICAL
      #   cross-cluster pollution — GIT_BASE_PATH was hardcoded to
      #   contabo-mkt; chart values now template per-Sovereign with
      #   provisioning-binary Go-side validation guard refusing commits
      #   to foreign cluster trees). 2026-05-05.
      # 1.4.23: deploy-bot auto-bump (services-auth image SHA roll).
      # 1.4.24 (#934 follow-up): smeSecrets.smtp.{host,port,from,user}
      #   defaults populated with mothership relay (mail.openova.io:587)
      #   so SME auth Pod's PIN delivery (gate 2) works on Sovereigns
      #   whose A5-seeded sovereign-smtp-credentials Secret only carries
      #   smtp-user + smtp-pass without host/port/from. 2026-05-05.
      # 1.4.25: deploy-bot auto-bump (sme-services 94ffe01 image roll).
      # 1.4.26 (#957 follow-up): catalyst-api-cutover-driver
      #   ClusterRole gains `create tokenreviews.authentication.k8s.io`
      #   so /api/v1/internal/cutover/trigger can validate the
      #   auto-trigger Job's SA token via TokenReview. Without this rule
      #   every trigger call returned 502 "token-review-failed" on
      #   otech113 (chart 0.1.18 fixed the readiness loop but exposed
      #   this missing-RBAC bug as the next failure). 2026-05-05.
      # 1.4.29 (#983 follow-up): Sovereign Console URL contract — clean
      #   root URLs (/dashboard /jobs /cloud …), sovereign_self.go store
      #   fallback (data renders the moment cutover-import lands without
      #   waiting for the orchestrator's chart-values overlay write).
      #   2026-05-05.
      # NOTE(review): the version pinned below (1.4.32) is newer than the
      # last changelog entry above (1.4.29); entries for 1.4.30–1.4.32 are
      # not recorded here — TODO add them.
      version: 1.4.32
      sourceRef:
        kind: HelmRepository
        name: bp-catalyst-platform
        namespace: flux-system
  # Event-driven install: umbrella chart deploys ~10 Catalyst services
  # (console, marketplace, admin, catalog-svc, projector, provisioning,
  # environment-controller, blueprint-controller, billing). Inter-service
  # readiness via OTel/NATS subjects is multi-minute and not Helm's
  # concern. Replaces PR #221 spec.timeout: 15m.
  #
  # Issue #910 (otech105 incident, 2026-05-05): 15m was too tight for
  # bp-catalyst-platform on a fresh franchised Sovereign with the full
  # SME service stack (sme-services + tenant-orchestration + post-install
  # secret mirror Jobs). The chart genuinely needs ~20 minutes worst
  # case before remediation.retries kicks in. Bumped to 25m
  # specifically for this umbrella chart — every other bp-* chart
  # remains at its previous (or default) timeout because they install
  # in well under 5 minutes empirically.
  install:
    disableWait: true
    timeout: 25m
    remediation:
      retries: 3
  upgrade:
    disableWait: true
    timeout: 25m
    remediation:
      retries: 3
  # Per-Sovereign overrides for the umbrella — sovereign-FQDN-derived hostnames
  # for console/admin/api. All chart-level Catalyst service config (image refs,
  # OTel endpoints, NATS subjects) lives in products/catalyst/chart/values.yaml.
  #
  # String-typed substitutions below are quoted so that an empty or
  # unusual expansion still parses as a string rather than null.
  values:
    global:
      sovereignFQDN: "${SOVEREIGN_FQDN}"
      # sovereignLBIP — Sovereign's load-balancer public IPv4. Issue #900:
      # the Day-2 multi-domain add-domain flow uses this to pre-register
      # glue records at the customer's registrar before flipping NS.
      # Resolved via envsubst from `SOVEREIGN_LB_IP` set in the Sovereign
      # cloud-init env (rendered into bootstrap-kit by infra/hetzner from
      # hcloud_load_balancer.main.ipv4 — see infra/hetzner/main.tf:274).
      # When the Sovereign cloud-init pre-dates #900 the env stays empty
      # and the chart renders an empty `lbIP` ConfigMap key — catalyst-api
      # then short-circuits the glue registration and falls back to plain
      # set_ns (legacy behaviour).
      # Quoted: an empty expansion must yield "" (empty string), not null.
      sovereignLBIP: "${SOVEREIGN_LB_IP}"
    ingress:
      hosts:
        console:
          host: "console.${SOVEREIGN_FQDN}"
        admin:
          host: "admin.${SOVEREIGN_FQDN}"
        marketplace:
          host: "marketplace.${SOVEREIGN_FQDN}"
        api:
          host: "api.${SOVEREIGN_FQDN}"
      # Marketplace mode (issue #710). Toggle to true via envsubst
      # MARKETPLACE_ENABLED in the per-Sovereign overlay (catalyst-api
      # writes this when the wizard's "Enable Marketplace" component is
      # checked). When true, bp-catalyst-platform 1.3.0+ renders the
      # marketplace + tenant-wildcard HTTPRoutes and the cross-namespace
      # ReferenceGrant.
      marketplace:
        # Deliberately unquoted: must expand to a YAML boolean.
        enabled: ${MARKETPLACE_ENABLED:-false}
    # ─── Multi-zone parent domains (issue #827, parent epic #825) ──────
    # One wildcard Certificate per parent zone, rendered by chart 1.4.0+
    # into kube-system. Each cert renews independently; a stalled
    # DNS-01 challenge on one zone never blocks another zone's renewal.
    # Source of truth is the same ${PARENT_DOMAINS_YAML} variable used
    # by bootstrap-kit slot 11 (bp-powerdns) so the two slots stay in
    # lockstep on what the Sovereign considers a parent zone.
    # When the operator brings only one parent domain (default
    # zero-touch flow), cloud-init pre-renders this variable to a
    # single-entry array derived from ${sovereign_fqdn}.
    # Deliberately unquoted: the variable expands to a YAML array.
    # NOTE(review): assumed to be a top-level chart value (sibling of
    # `global` and `ingress`) — confirm against the chart's values.yaml.
    parentZones: ${PARENT_DOMAINS_YAML}