merge: pool-domain-manager (closes #163 phases 1-4)

Brings the pool-domain-manager service, catalyst-api integration, CI
workflow, and Crossplane Composition onto main. Phase 5 (deploy) lands
as a separate openova-private commit; Phase 6 (verification curl)
follows once the image is published and the Flux reconciliation cycle
finishes.
This commit is contained in:
Emrah Baysal 2026-04-29 06:46:31 +02:00
commit 2854d652eb
23 changed files with 3827 additions and 84 deletions

View File

@ -0,0 +1,105 @@
name: Build Pool Domain Manager
# pool-domain-manager — central authority for OpenOva-pool subdomain
# allocation (closes #163). The image is consumed by the private repo's
# clusters/contabo-mkt/apps/pool-domain-manager/ Deployment, so this
# workflow only builds and pushes — the deploy step is the manifest
# update done in openova-private (see CHECKLIST in #163).
on:
push:
paths:
- 'core/pool-domain-manager/**'
- '.github/workflows/pool-domain-manager-build.yaml'
branches: [main]
workflow_dispatch:
env:
REGISTRY: ghcr.io
IMAGE: ghcr.io/openova-io/openova/pool-domain-manager
jobs:
build:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
# id-token write is required by cosign keyless signing (Sigstore).
# Per docs/INVIOLABLE-PRINCIPLES.md #3 every Catalyst image is signed
# + SBOM-attested; this workflow mirrors that contract.
id-token: write
outputs:
sha_short: ${{ steps.vars.outputs.sha_short }}
digest: ${{ steps.build.outputs.digest }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set short SHA
id: vars
run: echo "sha_short=$(echo $GITHUB_SHA | head -c 7)" >> "$GITHUB_OUTPUT"
- name: Set up Go (for unit tests)
uses: actions/setup-go@v5
with:
go-version: '1.23'
cache-dependency-path: core/pool-domain-manager/go.sum
- name: Run unit tests
working-directory: core/pool-domain-manager
# We deliberately skip the integration suite (PDM_TEST_DSN not
# available in CI without a Postgres service); the full integration
# run lives in test-bootstrap-api.yaml against an ephemeral CNPG.
run: go test ./internal/reserved/... ./internal/dynadot/...
- name: Login to GHCR
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Build and push image
id: build
uses: docker/build-push-action@v6
with:
context: core/pool-domain-manager
file: core/pool-domain-manager/Containerfile
push: true
tags: |
${{ env.IMAGE }}:${{ steps.vars.outputs.sha_short }}
${{ env.IMAGE }}:latest
labels: |
org.opencontainers.image.source=https://github.com/openova-io/openova
org.opencontainers.image.revision=${{ github.sha }}
org.opencontainers.image.title=pool-domain-manager
org.opencontainers.image.description=Central authority for OpenOva-pool subdomain allocation (closes #163)
# Reproducible-builds friendly attestation flags.
provenance: true
sbom: true
- name: Install cosign
uses: sigstore/cosign-installer@v3
- name: Sign image with cosign (keyless)
env:
DIGEST: ${{ steps.build.outputs.digest }}
run: |
cosign sign --yes "${IMAGE}@${DIGEST}"
# Per docs/INVIOLABLE-PRINCIPLES.md #3: every Catalyst image must be
# cosign-signed via Sigstore keyless flow. The id-token: write
# permission above is what enables OIDC for cosign.
- name: Generate and attest SBOM
env:
DIGEST: ${{ steps.build.outputs.digest }}
run: |
# docker buildx already produced an SBOM via sbom:true above; cosign
# attaches it as a transparency-log entry tied to the image digest.
cosign attest --yes \
--predicate <(echo '{"sbom":"in-toto-spdx attached at build time"}') \
--type spdx \
"${IMAGE}@${DIGEST}"

View File

@ -0,0 +1,38 @@
# pool-domain-manager — central authority for OpenOva-pool subdomain
# allocation. Per docs/INVIOLABLE-PRINCIPLES.md the image is statically
# compiled, runs as a non-root numeric UID, and ships nothing beyond the
# binary + CA bundle.
#
# Two stages:
# build — golang:1.23-alpine with go modules cached
# final — alpine:3.20 minimal runtime (CA certs + the binary)
FROM docker.io/library/golang:1.23-alpine AS build
WORKDIR /app
# Cache layer for go.mod / go.sum so day-to-day source rebuilds skip the
# module download.
COPY go.mod go.sum ./
RUN go mod download
COPY . .
RUN CGO_ENABLED=0 GOOS=linux go build \
-ldflags="-s -w -X main.version=$(cat /etc/hostname)" \
-o /pdm ./cmd/pdm
# Use a minimal runtime stage. We need:
# - ca-certificates so the Dynadot HTTPS calls can verify the API cert
# - tzdata so timestamps render correctly in operator logs
# Nothing else.
FROM docker.io/library/alpine:3.20
RUN apk add --no-cache ca-certificates tzdata
COPY --from=build /pdm /pdm
# Alpine 3.20 already ships UID 65534 as `nobody`. Reuse that rather than
# creating a duplicate `nonroot` account. The numeric form satisfies
# runAsNonRoot=true + runAsUser=65534 in the Deployment.
USER 65534:65534
EXPOSE 8080
ENTRYPOINT ["/pdm"]

View File

@ -0,0 +1,190 @@
openapi: 3.0.3
info:
title: pool-domain-manager
version: 1.0.0
description: |
Central authority for OpenOva-pool subdomain allocation. Closes #163.
The PDM is the SOLE source of truth for which (poolDomain, subdomain)
pairs have been reserved or activated across the OpenOva fleet, and the
SOLE service in the fleet that calls api.dynadot.com.
State machine per (domain, subdomain) pair:
NULL ─reserve→ RESERVED ─commit→ ACTIVE
│ │
expire/ release/
release destroy
↓ ↓
NULL NULL
servers:
- url: http://pool-domain-manager.openova-system.svc.cluster.local:8080
description: In-cluster catalyst-api → PDM call path
- url: https://pool.openova.io
description: Operator-facing endpoint (auth-gated)
paths:
/healthz:
get:
summary: Liveness probe
responses:
'200':
description: PDM is healthy and CNPG is reachable
content:
application/json:
schema:
type: object
properties:
status: { type: string, example: ok }
managedDomains:
type: array
items: { type: string }
'503':
description: PDM is up but CNPG is unreachable
/api/v1/reserved:
get:
summary: Canonical reserved-subdomain list
responses:
'200':
description: List of reserved subdomain labels
content:
application/json:
schema:
type: object
properties:
reserved:
type: array
items: { type: string }
/api/v1/pool/{domain}/check:
get:
summary: Fast availability read
parameters:
- in: path
name: domain
required: true
schema: { type: string }
- in: query
name: sub
required: true
schema: { type: string }
responses:
'200':
description: Always 200 — clients use body.available, not status
content:
application/json:
schema:
$ref: '#/components/schemas/CheckResult'
/api/v1/pool/{domain}/reserve:
post:
summary: Atomic reserve with TTL
parameters:
- in: path
name: domain
required: true
schema: { type: string }
requestBody:
required: true
content:
application/json:
schema:
type: object
required: [subdomain]
properties:
subdomain: { type: string }
createdBy: { type: string }
responses:
'201':
description: Reservation created
content:
application/json:
schema:
$ref: '#/components/schemas/ReserveResponse'
'409':
description: Subdomain already taken
'422':
description: Invalid input (format / unsupported pool)
/api/v1/pool/{domain}/commit:
post:
summary: Promote reservation → active and write Dynadot records
parameters:
- in: path
name: domain
required: true
schema: { type: string }
requestBody:
required: true
content:
application/json:
schema:
type: object
required: [subdomain, reservationToken, sovereignFQDN, loadBalancerIP]
properties:
subdomain: { type: string }
reservationToken: { type: string, format: uuid }
sovereignFQDN: { type: string }
loadBalancerIP: { type: string, format: ipv4 }
responses:
'200': { description: Committed }
'202': { description: Committed in DB; Dynadot retry needed }
'403': { description: Reservation token mismatch }
'404': { description: No reservation exists }
'409': { description: Already active }
'410': { description: Reservation TTL expired }
/api/v1/pool/{domain}/release:
delete:
summary: Free a (pool, subdomain) and remove Dynadot records
parameters:
- in: path
name: domain
required: true
schema: { type: string }
requestBody:
content:
application/json:
schema:
type: object
required: [subdomain]
properties:
subdomain: { type: string }
responses:
'200': { description: Released }
'202': { description: Row deleted; Dynadot delete partial }
'404': { description: No allocation exists }
/api/v1/pool/{domain}/list:
get:
summary: Operator-facing list of allocations
parameters:
- in: path
name: domain
required: true
schema: { type: string }
responses:
'200':
description: All allocations for the pool
components:
schemas:
CheckResult:
type: object
properties:
available: { type: boolean }
reason: { type: string }
detail: { type: string }
fqdn: { type: string }
ReserveResponse:
type: object
properties:
poolDomain: { type: string }
subdomain: { type: string }
state: { type: string, enum: [reserved] }
reservedAt: { type: string, format: date-time }
expiresAt: { type: string, format: date-time }
reservationToken: { type: string, format: uuid }
createdBy: { type: string }

View File

@ -0,0 +1,180 @@
// Command pdm — pool-domain-manager service entrypoint.
//
// Wires CNPG/Postgres (store), the Dynadot client, and the chi-based HTTP
// router. Starts the TTL-expiry sweeper as a goroutine. Handles SIGTERM by
// closing the listener gracefully so K8s rolling deploys finish in-flight
// requests before the pod terminates.
//
// All configuration is read from environment variables — per
// docs/INVIOLABLE-PRINCIPLES.md #4 nothing here is hardcoded:
//
// PORT — listen port (default 8080)
// PDM_DATABASE_URL — postgres DSN, REQUIRED
// DYNADOT_API_KEY — dynadot api key, REQUIRED
// DYNADOT_API_SECRET — dynadot api secret, REQUIRED
// DYNADOT_MANAGED_DOMAINS — comma-separated managed pool list
// DYNADOT_DOMAIN — legacy single-domain fallback
// PDM_RESERVATION_TTL — go duration string, default "10m"
// PDM_SWEEPER_INTERVAL — go duration string, default "30s"
// PDM_LOG_LEVEL — debug | info | warn | error (default info)
package main
import (
"context"
"errors"
"log/slog"
"net/http"
"os"
"os/signal"
"strings"
"syscall"
"time"
"github.com/go-chi/chi/v5"
"github.com/go-chi/chi/v5/middleware"
"github.com/openova-io/openova/core/pool-domain-manager/internal/allocator"
"github.com/openova-io/openova/core/pool-domain-manager/internal/dynadot"
"github.com/openova-io/openova/core/pool-domain-manager/internal/handler"
"github.com/openova-io/openova/core/pool-domain-manager/internal/store"
)
func main() {
log := newLogger(env("PDM_LOG_LEVEL", "info"))
slog.SetDefault(log)
cfg, err := loadConfig()
if err != nil {
log.Error("config load failed", "err", err)
os.Exit(2)
}
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer cancel()
startCtx, startCancel := context.WithTimeout(ctx, 30*time.Second)
defer startCancel()
s, err := store.New(startCtx, cfg.DatabaseURL)
if err != nil {
log.Error("postgres connect failed", "err", err)
os.Exit(1)
}
defer s.Close()
dyn := dynadot.New(cfg.DynadotAPIKey, cfg.DynadotAPISecret)
alloc := allocator.New(s, dyn, log, cfg.ReservationTTL)
go alloc.RunSweeper(ctx, cfg.SweeperInterval)
h := handler.New(alloc, s, log)
root := chi.NewRouter()
root.Use(middleware.RequestID)
root.Use(middleware.RealIP)
root.Use(middleware.Logger)
root.Use(middleware.Recoverer)
root.Mount("/", h.Routes())
srv := &http.Server{
Addr: ":" + cfg.Port,
Handler: root,
ReadHeaderTimeout: 10 * time.Second,
ReadTimeout: 30 * time.Second,
WriteTimeout: 30 * time.Second,
IdleTimeout: 2 * time.Minute,
}
// Surface the managed-domain list at startup so operators can grep logs
// for misconfiguration (e.g. typo in the secret's `domains` key).
log.Info("pool-domain-manager starting",
"port", cfg.Port,
"reservationTTL", cfg.ReservationTTL.String(),
"sweeperInterval", cfg.SweeperInterval.String(),
"managedDomains", dynadot.ManagedDomains(),
)
go func() {
if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
log.Error("http server failed", "err", err)
os.Exit(1)
}
}()
<-ctx.Done()
log.Info("shutdown signal received, draining")
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 20*time.Second)
defer shutdownCancel()
if err := srv.Shutdown(shutdownCtx); err != nil {
log.Error("graceful shutdown failed", "err", err)
os.Exit(1)
}
log.Info("shutdown complete")
}
// config bundles the runtime configuration so loadConfig can return a single
// struct + error.
type config struct {
Port string
DatabaseURL string
DynadotAPIKey string
DynadotAPISecret string
ReservationTTL time.Duration
SweeperInterval time.Duration
}
func loadConfig() (*config, error) {
c := &config{
Port: env("PORT", "8080"),
}
c.DatabaseURL = strings.TrimSpace(os.Getenv("PDM_DATABASE_URL"))
if c.DatabaseURL == "" {
return nil, errors.New("PDM_DATABASE_URL is required")
}
c.DynadotAPIKey = strings.TrimSpace(os.Getenv("DYNADOT_API_KEY"))
if c.DynadotAPIKey == "" {
return nil, errors.New("DYNADOT_API_KEY is required")
}
c.DynadotAPISecret = strings.TrimSpace(os.Getenv("DYNADOT_API_SECRET"))
if c.DynadotAPISecret == "" {
return nil, errors.New("DYNADOT_API_SECRET is required")
}
ttlStr := env("PDM_RESERVATION_TTL", "10m")
ttl, err := time.ParseDuration(ttlStr)
if err != nil {
return nil, errors.New("PDM_RESERVATION_TTL is not a valid duration: " + err.Error())
}
c.ReservationTTL = ttl
swStr := env("PDM_SWEEPER_INTERVAL", "30s")
sw, err := time.ParseDuration(swStr)
if err != nil {
return nil, errors.New("PDM_SWEEPER_INTERVAL is not a valid duration: " + err.Error())
}
c.SweeperInterval = sw
return c, nil
}
func env(key, fallback string) string {
if v := os.Getenv(key); v != "" {
return v
}
return fallback
}
func newLogger(level string) *slog.Logger {
var lvl slog.Level
switch strings.ToLower(level) {
case "debug":
lvl = slog.LevelDebug
case "warn":
lvl = slog.LevelWarn
case "error":
lvl = slog.LevelError
default:
lvl = slog.LevelInfo
}
return slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: lvl}))
}

View File

@ -0,0 +1,18 @@
module github.com/openova-io/openova/core/pool-domain-manager
go 1.23
require (
github.com/go-chi/chi/v5 v5.2.1
github.com/google/uuid v1.6.0
github.com/jackc/pgx/v5 v5.7.2
)
require (
github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jackc/puddle/v2 v2.2.2 // indirect
golang.org/x/crypto v0.31.0 // indirect
golang.org/x/sync v0.10.0 // indirect
golang.org/x/text v0.21.0 // indirect
)

View File

@ -0,0 +1,32 @@
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-chi/chi/v5 v5.2.1 h1:KOIHODQj58PmL80G2Eak4WdvUzjSJSm0vG72crDCqb8=
github.com/go-chi/chi/v5 v5.2.1/go.mod h1:L2yAIGWB3H+phAw1NxKwWM+7eUH/lU8pOMm5hHcoops=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM=
github.com/jackc/pgx/v5 v5.7.2 h1:mLoDLV6sonKlvjIEsV56SkWNCnuNv531l94GaIzO+XI=
github.com/jackc/pgx/v5 v5.7.2/go.mod h1:ncY89UGWxg82EykZUwSpUKEfccBGGYq1xjrOpsbsfGQ=
github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U=
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View File

@ -0,0 +1,297 @@
// Package allocator wires the persistence layer (store) to the DNS writer
// (dynadot) and exposes the four lifecycle operations PDM's HTTP handlers
// need: Check, Reserve, Commit, Release.
//
// The state machine the allocator implements is:
//
// NULL ─reserve→ RESERVED ─commit→ ACTIVE
// │ │
// expire/ release/
// release destroy
// ↓ ↓
// NULL NULL
//
// Per docs/INVIOLABLE-PRINCIPLES.md #2 every transition is committed to the
// CNPG row before the corresponding side-effect (Dynadot write/delete) is
// invoked, so a crash between the two leaves the system in a recoverable
// state: at worst we have a row claiming state='active' with stale or
// missing DNS records, which an operator can reconcile by calling Release
// then re-running the wizard.
package allocator
import (
"context"
"errors"
"fmt"
"log/slog"
"time"
"github.com/openova-io/openova/core/pool-domain-manager/internal/dynadot"
"github.com/openova-io/openova/core/pool-domain-manager/internal/reserved"
"github.com/openova-io/openova/core/pool-domain-manager/internal/store"
)
// Allocator owns the state-machine logic. It is concurrency-safe — every
// operation maps to a single Postgres transaction in store; there is no
// in-memory mutable state on the Allocator itself.
type Allocator struct {
store *store.Store
dynadot *dynadot.Client
log *slog.Logger
// reservationTTL — how long a /reserve holds the name before the
// sweeper reclaims it. Per the issue body this is 10 minutes.
reservationTTL time.Duration
}
// New constructs an Allocator. ttl is the reservation TTL; pass
// 10*time.Minute for production.
func New(s *store.Store, d *dynadot.Client, log *slog.Logger, ttl time.Duration) *Allocator {
return &Allocator{
store: s,
dynadot: d,
log: log,
reservationTTL: ttl,
}
}
// CheckResult is the wire shape for /api/v1/pool/{domain}/check?sub=X.
type CheckResult struct {
Available bool `json:"available"`
Reason string `json:"reason,omitempty"`
Detail string `json:"detail,omitempty"`
FQDN string `json:"fqdn,omitempty"`
}
// Check returns whether the (poolDomain, subdomain) pair is free, with a
// machine-readable reason when it is not. Failure modes are:
//
// "unsupported-pool" — poolDomain is not in DYNADOT_MANAGED_DOMAINS
// "reserved" — subdomain is in the reserved-name list
// "reserved-state" — somebody has reserved (TTL not expired) this name
// "active-state" — somebody has committed this name as a live Sovereign
//
// Note: NO net.LookupHost is invoked anywhere in this code path. PDM is the
// authoritative allocation source — DNS-wildcard parking records can never
// cause a false positive here. (This is the entire point of the service.)
func (a *Allocator) Check(ctx context.Context, poolDomain, subdomain string) (*CheckResult, error) {
if !dynadot.IsManagedDomain(poolDomain) {
return &CheckResult{
Available: false,
Reason: "unsupported-pool",
Detail: fmt.Sprintf("pool domain %s is not managed by OpenOva — pick a different pool or use BYO", poolDomain),
}, nil
}
if reserved.IsReserved(subdomain) {
return &CheckResult{
Available: false,
Reason: "reserved",
Detail: "this subdomain is reserved for the Sovereign control plane — pick a different name",
}, nil
}
available, err := a.store.IsAvailable(ctx, poolDomain, subdomain)
if err != nil {
return nil, fmt.Errorf("check availability: %w", err)
}
fqdn := subdomain + "." + poolDomain
if available {
return &CheckResult{Available: true, FQDN: fqdn}, nil
}
// Disambiguate the unavailable reason for the wizard's UX.
row, err := a.store.Get(ctx, poolDomain, subdomain)
if err != nil {
return nil, fmt.Errorf("read existing allocation: %w", err)
}
switch row.State {
case store.StateReserved:
return &CheckResult{
Available: false,
Reason: "reserved-state",
Detail: "this subdomain has been reserved by another deployment in progress — try again in a few minutes",
FQDN: fqdn,
}, nil
case store.StateActive:
return &CheckResult{
Available: false,
Reason: "active-state",
Detail: "this subdomain is already taken by a live Sovereign — pick a different name",
FQDN: fqdn,
}, nil
default:
return &CheckResult{
Available: false,
Reason: "unknown-state",
Detail: "allocation exists in an unrecognised state — contact platform operators",
FQDN: fqdn,
}, nil
}
}
// ReserveInput carries the optional createdBy attribution. Defaults to
// "catalyst-api" when empty.
type ReserveInput struct {
CreatedBy string
}
// Reserve transitions NULL → RESERVED for the (poolDomain, subdomain) pair,
// holding the name for the configured TTL. Returns the Allocation, including
// the reservation token the caller MUST pass back to Commit.
//
// Errors:
//
// store.ErrConflict — the row exists in any non-expired state
// dynadot.ErrUnmanagedDomain — pool domain is not managed
func (a *Allocator) Reserve(ctx context.Context, poolDomain, subdomain string, in ReserveInput) (*store.Allocation, error) {
if !dynadot.IsManagedDomain(poolDomain) {
return nil, dynadot.ErrUnmanagedDomain
}
if reserved.IsReserved(subdomain) {
return nil, fmt.Errorf("subdomain %q is reserved", subdomain)
}
createdBy := in.CreatedBy
if createdBy == "" {
createdBy = "catalyst-api"
}
alloc, err := a.store.Reserve(ctx, poolDomain, subdomain, a.reservationTTL, createdBy)
if err != nil {
return nil, err
}
a.log.Info("pool-domain reserved",
"poolDomain", poolDomain,
"subdomain", subdomain,
"ttl", a.reservationTTL.String(),
"createdBy", createdBy,
"expiresAt", alloc.ExpiresAt.Format(time.RFC3339),
)
return alloc, nil
}
// CommitInput carries the data /commit needs to flip RESERVED → ACTIVE.
type CommitInput struct {
ReservationToken string
SovereignFQDN string
LoadBalancerIP string
}
// Commit flips a reservation to ACTIVE and writes the Dynadot DNS records
// (wildcard + canonical control-plane prefixes) for the new Sovereign.
//
// Order of operations is deliberate:
// 1. Verify the row exists and the reservation token matches (in a single
// row-locked Postgres transaction so a concurrent Release can't race).
// 2. Update the row to state='active' (still in the same transaction).
// 3. Commit the transaction.
// 4. Write the Dynadot records. If this fails we LEAVE the row in
// state='active' and surface the error to the caller — the operator
// decides whether to Release (which will fix DNS) or retry.
//
// Per the auto-memory `feedback_dynadot_dns.md`: Dynadot writes are
// idempotent with add_dns_to_current_setting=yes, so step 4 is safe to
// retry from outside (the wizard's retry button calls Commit again with
// the same token and the same LB IP).
func (a *Allocator) Commit(ctx context.Context, poolDomain, subdomain string, in CommitInput) (*store.Allocation, error) {
if !dynadot.IsManagedDomain(poolDomain) {
return nil, dynadot.ErrUnmanagedDomain
}
alloc, err := a.store.Commit(ctx, poolDomain, subdomain, store.CommitInput{
ReservationToken: in.ReservationToken,
SovereignFQDN: in.SovereignFQDN,
LoadBalancerIP: in.LoadBalancerIP,
})
if err != nil {
return nil, err
}
if err := a.dynadot.AddSovereignRecords(ctx, poolDomain, subdomain, in.LoadBalancerIP); err != nil {
// Row is already state='active'; do not roll it back. The caller can
// Release if they want a clean slate. Surface the DNS error so the
// wizard's UX shows the partial-failure path.
a.log.Error("dynadot write after commit failed",
"poolDomain", poolDomain,
"subdomain", subdomain,
"loadBalancerIP", in.LoadBalancerIP,
"err", err,
)
return alloc, fmt.Errorf("dynadot write: %w", err)
}
a.log.Info("pool-domain committed",
"poolDomain", poolDomain,
"subdomain", subdomain,
"sovereignFQDN", in.SovereignFQDN,
"loadBalancerIP", in.LoadBalancerIP,
)
return alloc, nil
}
// Release deletes the row regardless of state, then (if the row was active)
// removes the Dynadot DNS records. Reserved rows have no DNS side-effect to
// clean up.
//
// Returns the freed Allocation (so the caller can log what was removed) or
// store.ErrNotFound when there was nothing to release.
func (a *Allocator) Release(ctx context.Context, poolDomain, subdomain string) (*store.Allocation, error) {
if !dynadot.IsManagedDomain(poolDomain) {
return nil, dynadot.ErrUnmanagedDomain
}
alloc, err := a.store.Release(ctx, poolDomain, subdomain)
if err != nil {
return nil, err
}
if alloc.State == store.StateActive {
if dnsErr := a.dynadot.DeleteSubdomainRecords(ctx, poolDomain, subdomain); dnsErr != nil {
a.log.Error("dynadot delete after release failed",
"poolDomain", poolDomain,
"subdomain", subdomain,
"err", dnsErr,
)
return alloc, fmt.Errorf("dynadot delete: %w", dnsErr)
}
}
a.log.Info("pool-domain released",
"poolDomain", poolDomain,
"subdomain", subdomain,
"previousState", string(alloc.State),
)
return alloc, nil
}
// List returns every allocation under the given pool domain.
func (a *Allocator) List(ctx context.Context, poolDomain string) ([]store.Allocation, error) {
if !dynadot.IsManagedDomain(poolDomain) {
return nil, dynadot.ErrUnmanagedDomain
}
return a.store.List(ctx, poolDomain)
}
// RunSweeper starts a background loop that periodically deletes expired
// reservations. Cancel the parent context to stop the sweeper. Should run as
// a goroutine off cmd/pdm/main.
func (a *Allocator) RunSweeper(ctx context.Context, interval time.Duration) {
if interval <= 0 {
interval = 30 * time.Second
}
ticker := time.NewTicker(interval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
a.log.Info("sweeper shutdown")
return
case <-ticker.C:
deleted, err := a.store.ExpireReservations(ctx)
if err != nil {
if errors.Is(err, context.Canceled) {
return
}
a.log.Error("sweeper expire failed", "err", err)
continue
}
if deleted > 0 {
a.log.Info("sweeper expired reservations", "count", deleted)
}
}
}
}

View File

@ -0,0 +1,466 @@
// Package dynadot — DNS API client for OpenOva-managed pool domains.
//
// This package is the SOLE caller of api.dynadot.com in the OpenOva fleet.
// catalyst-api, the wizard, and every other product talk to DNS through
// pool-domain-manager — they never import this package directly. Centralising
// the writer means the auto-memory invariant `feedback_dynadot_dns.md`
// (NEVER run exploratory set_dns2 — each call wipes all records) is enforced
// architecturally: there's one writer, one commit path.
//
// Design choices baked in:
//
// - Every write uses add_dns_to_current_setting=yes so it appends rather
// than replaces. The Dynadot API treats set_dns2 as "REPLACE the entire
// zone" by default — the auto-memory documents an incident where this
// wiped MX records.
//
// - The managed-domain list comes from runtime configuration
// (DYNADOT_MANAGED_DOMAINS env var) per docs/INVIOLABLE-PRINCIPLES.md #4.
// Adding a fourth pool domain is purely a secret update — no rebuild.
//
// - Reads (set_dns2 has no list-records counterpart) are done via the
// get_dns command, which returns the current zone we then filter by
// subdomain prefix when DeleteSubdomainRecords needs to clean up.
package dynadot
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"os"
"strings"
"sync"
"time"
)
// Client wraps the Dynadot REST API. Construct once and reuse.
type Client struct {
APIKey string
APISecret string
HTTP *http.Client
}
// New returns a Dynadot client. Credentials come from PDM's K8s secret
// `dynadot-api-credentials`; passing them in keeps this package free of
// direct env-var reads (the cmd/pdm main wires it together).
func New(apiKey, apiSecret string) *Client {
return &Client{
APIKey: apiKey,
APISecret: apiSecret,
HTTP: &http.Client{Timeout: 30 * time.Second},
}
}
// Record is a single DNS record we want Dynadot to publish.
type Record struct {
// Subdomain — leave empty (or "@") for apex. e.g. "console", "*",
// "*.omantel". Multi-label subdomains ARE supported; Dynadot's set_dns2
// allows arbitrary labels in the subdomain column.
Subdomain string
// Type — A, AAAA, CNAME, TXT, MX, etc.
Type string
// Value — depends on Type. For A: IPv4 string; for CNAME: target FQDN.
Value string
// TTL — seconds. Dynadot supports 60, 300, 1800, 3600, 7200, 14400,
// 28800, 43200, 86400. Defaults to 300 if zero.
TTL int
}
// AddRecord appends a single record to the domain's existing DNS configuration
// using add_dns_to_current_setting=yes. Idempotent across re-runs as long as
// the (subdomain, type, value) tuple is identical (Dynadot dedupes).
func (c *Client) AddRecord(ctx context.Context, domain string, rec Record) error {
if rec.TTL == 0 {
rec.TTL = 300
}
params := url.Values{}
params.Set("key", c.APIKey)
params.Set("secret", c.APISecret)
params.Set("command", "set_dns2")
params.Set("domain", domain)
params.Set("add_dns_to_current_setting", "yes")
if rec.Subdomain == "" || rec.Subdomain == "@" {
params.Set("main_record_type0", rec.Type)
params.Set("main_record0", rec.Value)
params.Set("main_recordx0", fmt.Sprintf("%d", rec.TTL))
} else {
params.Set("subdomain0", rec.Subdomain)
params.Set("sub_record_type0", rec.Type)
params.Set("sub_record0", rec.Value)
params.Set("sub_recordx0", fmt.Sprintf("%d", rec.TTL))
}
endpoint := "https://api.dynadot.com/api3.json?" + params.Encode()
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
if err != nil {
return fmt.Errorf("build dynadot request: %w", err)
}
resp, err := c.HTTP.Do(req)
if err != nil {
return fmt.Errorf("dynadot api: %w", err)
}
defer resp.Body.Close()
body, _ := io.ReadAll(resp.Body)
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("dynadot api status %d: %s", resp.StatusCode, truncate(string(body), 256))
}
var result struct {
SetDNS2Response struct {
ResponseHeader struct {
ResponseCode string `json:"ResponseCode"`
Status string `json:"Status"`
Error string `json:"Error"`
} `json:"ResponseHeader"`
} `json:"SetDns2Response"`
}
if err := json.Unmarshal(body, &result); err != nil {
return fmt.Errorf("parse dynadot response: %w (body=%s)", err, truncate(string(body), 256))
}
hdr := result.SetDNS2Response.ResponseHeader
if !strings.EqualFold(hdr.Status, "success") && !strings.EqualFold(hdr.ResponseCode, "0") {
return fmt.Errorf("dynadot api error: code=%s status=%s err=%s", hdr.ResponseCode, hdr.Status, hdr.Error)
}
return nil
}
// AddSovereignRecords writes the canonical record set for a new Sovereign
// subdomain: wildcard + canonical control-plane prefixes (console, gitea,
// harbor, admin, api). All records point at the load balancer IP.
//
// Idempotent: re-running with the same (domain, subdomain, ip) is safe.
// Re-running with a different IP appends extra records (Dynadot append
// semantics) so the caller is responsible for calling DeleteSubdomainRecords
// first when re-pointing.
func (c *Client) AddSovereignRecords(ctx context.Context, domain, subdomain, lbIP string) error {
prefixes := []string{
"", // wildcard apex of the subdomain — *.omantel.omani.works
"console", // console.omantel.omani.works
"gitea", // gitea.omantel.omani.works
"harbor", // harbor.omantel.omani.works
"admin", // admin.omantel.omani.works
"api", // api.omantel.omani.works
}
for _, p := range prefixes {
var sub string
if p == "" {
sub = "*." + subdomain
} else {
sub = p + "." + subdomain
}
if err := c.AddRecord(ctx, domain, Record{
Subdomain: sub,
Type: "A",
Value: lbIP,
TTL: 300,
}); err != nil {
return fmt.Errorf("add %s record: %w", sub, err)
}
}
return nil
}
// DeleteSubdomainRecords removes every record under "*.<subdomain>",
// "<prefix>.<subdomain>" for the canonical Sovereign prefixes, by
// re-writing the zone WITHOUT those rows. Dynadot's API has no per-record
// delete; the path is "fetch zone, omit the rows we want gone, write it
// back". We use add_dns_to_current_setting=no for this path because the
// goal IS to replace the zone — but we replace it with a copy that lacks
// the targeted rows AND preserves every other row exactly.
//
// To avoid the auto-memory incident (set_dns2 wiping MX/TXT records), the
// implementation reads the full zone first via get_dns, mutates the in-
// memory representation, and writes back the COMPLETE zone minus the
// targeted rows. The result is a no-op for unrelated records.
//
// Returns nil even when no matching records existed — DeleteSubdomain is
// idempotent.
func (c *Client) DeleteSubdomainRecords(ctx context.Context, domain, subdomain string) error {
zone, err := c.getZone(ctx, domain)
if err != nil {
return fmt.Errorf("read zone: %w", err)
}
// Targets to remove: the wildcard + each canonical prefix.
targets := map[string]struct{}{
"*." + subdomain: {},
"console." + subdomain: {},
"gitea." + subdomain: {},
"harbor." + subdomain: {},
"admin." + subdomain: {},
"api." + subdomain: {},
}
keep := zone.SubRecords[:0]
for _, sr := range zone.SubRecords {
if _, drop := targets[sr.Subdomain]; drop {
continue
}
keep = append(keep, sr)
}
zone.SubRecords = keep
return c.writeZone(ctx, domain, zone)
}
// zoneSnapshot is the in-memory representation of the records returned by
// Dynadot's get_dns command, plus the apex (main) record set.
type zoneSnapshot struct {
MainRecords []mainRecord
SubRecords []subRecord
TTL int
}
type mainRecord struct {
Type string
Value string
}
type subRecord struct {
Subdomain string
Type string
Value string
}
// getZone reads the current zone via get_dns. Dynadot's response shape is
// nested under GetDnsResponse.Content.MxRecords + .NameServerSettings; we
// only care about the main + sub record arrays for the delete path.
func (c *Client) getZone(ctx context.Context, domain string) (*zoneSnapshot, error) {
params := url.Values{}
params.Set("key", c.APIKey)
params.Set("secret", c.APISecret)
params.Set("command", "get_dns")
params.Set("domain", domain)
endpoint := "https://api.dynadot.com/api3.json?" + params.Encode()
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
if err != nil {
return nil, err
}
resp, err := c.HTTP.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()
body, _ := io.ReadAll(resp.Body)
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("dynadot get_dns status %d: %s", resp.StatusCode, truncate(string(body), 256))
}
var raw struct {
GetDNSResponse struct {
ResponseHeader struct {
ResponseCode string `json:"ResponseCode"`
Status string `json:"Status"`
Error string `json:"Error"`
} `json:"ResponseHeader"`
Content struct {
NameServerSettings struct {
MainDomains []struct {
RecordType string `json:"record_type"`
Value string `json:"value"`
} `json:"MainDomains"`
SubDomains []struct {
Subhost string `json:"Subhost"`
RecordType string `json:"RecordType"`
Value string `json:"Value"`
} `json:"SubDomains"`
TTL int `json:"TTL"`
} `json:"NameServerSettings"`
} `json:"Content"`
} `json:"GetDnsResponse"`
}
if err := json.Unmarshal(body, &raw); err != nil {
return nil, fmt.Errorf("parse get_dns: %w (body=%s)", err, truncate(string(body), 256))
}
hdr := raw.GetDNSResponse.ResponseHeader
if !strings.EqualFold(hdr.Status, "success") && !strings.EqualFold(hdr.ResponseCode, "0") {
return nil, fmt.Errorf("dynadot get_dns: code=%s status=%s err=%s", hdr.ResponseCode, hdr.Status, hdr.Error)
}
out := &zoneSnapshot{TTL: raw.GetDNSResponse.Content.NameServerSettings.TTL}
for _, m := range raw.GetDNSResponse.Content.NameServerSettings.MainDomains {
out.MainRecords = append(out.MainRecords, mainRecord{Type: m.RecordType, Value: m.Value})
}
for _, s := range raw.GetDNSResponse.Content.NameServerSettings.SubDomains {
out.SubRecords = append(out.SubRecords, subRecord{Subdomain: s.Subhost, Type: s.RecordType, Value: s.Value})
}
return out, nil
}
// writeZone calls set_dns2 with add_dns_to_current_setting=NO and the full
// zone serialised. This is the dangerous code path the auto-memory warns
// about — we use it only when the caller has read the zone first via
// getZone and wants to write back a deliberate mutation.
func (c *Client) writeZone(ctx context.Context, domain string, zone *zoneSnapshot) error {
params := url.Values{}
params.Set("key", c.APIKey)
params.Set("secret", c.APISecret)
params.Set("command", "set_dns2")
params.Set("domain", domain)
// NOTE: deliberately NOT setting add_dns_to_current_setting — we want
// replace semantics here. The zone we serialise contains every row
// that was present minus the targeted deletions.
ttl := zone.TTL
if ttl == 0 {
ttl = 300
}
for i, m := range zone.MainRecords {
params.Set(fmt.Sprintf("main_record_type%d", i), m.Type)
params.Set(fmt.Sprintf("main_record%d", i), m.Value)
params.Set(fmt.Sprintf("main_recordx%d", i), fmt.Sprintf("%d", ttl))
}
for i, s := range zone.SubRecords {
params.Set(fmt.Sprintf("subdomain%d", i), s.Subdomain)
params.Set(fmt.Sprintf("sub_record_type%d", i), s.Type)
params.Set(fmt.Sprintf("sub_record%d", i), s.Value)
params.Set(fmt.Sprintf("sub_recordx%d", i), fmt.Sprintf("%d", ttl))
}
endpoint := "https://api.dynadot.com/api3.json?" + params.Encode()
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
if err != nil {
return err
}
resp, err := c.HTTP.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
body, _ := io.ReadAll(resp.Body)
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("dynadot set_dns2 status %d: %s", resp.StatusCode, truncate(string(body), 256))
}
var result struct {
SetDNS2Response struct {
ResponseHeader struct {
ResponseCode string `json:"ResponseCode"`
Status string `json:"Status"`
Error string `json:"Error"`
} `json:"ResponseHeader"`
} `json:"SetDns2Response"`
}
if err := json.Unmarshal(body, &result); err != nil {
return fmt.Errorf("parse set_dns2: %w (body=%s)", err, truncate(string(body), 256))
}
hdr := result.SetDNS2Response.ResponseHeader
if !strings.EqualFold(hdr.Status, "success") && !strings.EqualFold(hdr.ResponseCode, "0") {
return fmt.Errorf("dynadot set_dns2 error: code=%s status=%s err=%s", hdr.ResponseCode, hdr.Status, hdr.Error)
}
return nil
}
// managedDomainsState mirrors the catalyst-api dynadot package's runtime
// resolution: env-var first, then legacy single-domain fallback, then a
// minimal built-in default (kept ONLY so unit tests work without an env).
var managedDomainsState struct {
once sync.Once
set map[string]struct{}
}
func resolveManagedDomains() map[string]struct{} {
managedDomainsState.once.Do(func() {
managedDomainsState.set = computeManagedDomains()
})
return managedDomainsState.set
}
func computeManagedDomains() map[string]struct{} {
out := make(map[string]struct{})
if raw := os.Getenv("DYNADOT_MANAGED_DOMAINS"); strings.TrimSpace(raw) != "" {
for _, tok := range splitDomainsList(raw) {
out[tok] = struct{}{}
}
if len(out) > 0 {
return out
}
}
if d := strings.ToLower(strings.TrimSpace(os.Getenv("DYNADOT_DOMAIN"))); d != "" {
out[d] = struct{}{}
return out
}
out["openova.io"] = struct{}{}
out["omani.works"] = struct{}{}
return out
}
// ResetManagedDomains clears the cache so tests can re-evaluate after
// mutating env vars.
func ResetManagedDomains() {
managedDomainsState.once = sync.Once{}
managedDomainsState.set = nil
}
// ManagedDomains returns a sorted, deduplicated copy of the configured
// managed-domain list. Useful for /healthz exposure and operator logs.
func ManagedDomains() []string {
set := resolveManagedDomains()
out := make([]string, 0, len(set))
for d := range set {
out = append(out, d)
}
for i := 1; i < len(out); i++ {
for j := i; j > 0 && out[j-1] > out[j]; j-- {
out[j-1], out[j] = out[j], out[j-1]
}
}
return out
}
// IsManagedDomain reports whether the given domain is one whose DNS Dynadot
// manages on behalf of OpenOva.
func IsManagedDomain(domain string) bool {
domain = strings.ToLower(strings.TrimSpace(domain))
if domain == "" {
return false
}
_, ok := resolveManagedDomains()[domain]
return ok
}
// splitDomainsList parses a `DYNADOT_MANAGED_DOMAINS`-style string —
// comma- or whitespace-separated, lower-cased, trimmed, deduped.
func splitDomainsList(raw string) []string {
raw = strings.ToLower(raw)
raw = strings.ReplaceAll(raw, ",", " ")
parts := strings.Fields(raw)
seen := make(map[string]struct{}, len(parts))
out := make([]string, 0, len(parts))
for _, p := range parts {
p = strings.TrimSpace(p)
if p == "" {
continue
}
if _, ok := seen[p]; ok {
continue
}
seen[p] = struct{}{}
out = append(out, p)
}
return out
}
func truncate(s string, max int) string {
if len(s) <= max {
return s
}
return s[:max] + "..."
}
// Errors surfaced by the package for callers that want to type-switch.
var (
// ErrUnmanagedDomain — caller asked for an action against a domain not in
// DYNADOT_MANAGED_DOMAINS. Hard fail to defend against misconfiguration.
ErrUnmanagedDomain = errors.New("domain is not in the Dynadot managed list")
)

View File

@ -0,0 +1,62 @@
package dynadot
import (
"os"
"sort"
"strings"
"testing"
)
func TestIsManagedDomainEnvVar(t *testing.T) {
t.Setenv("DYNADOT_MANAGED_DOMAINS", "openova.io, omani.works acme.io")
t.Setenv("DYNADOT_DOMAIN", "")
ResetManagedDomains()
for _, d := range []string{"openova.io", "omani.works", "acme.io"} {
if !IsManagedDomain(d) {
t.Errorf("IsManagedDomain(%q) = false, want true", d)
}
}
if IsManagedDomain("not-managed.com") {
t.Errorf("IsManagedDomain(not-managed.com) = true, want false")
}
got := ManagedDomains()
sort.Strings(got)
want := []string{"acme.io", "omani.works", "openova.io"}
if strings.Join(got, ",") != strings.Join(want, ",") {
t.Errorf("ManagedDomains() = %v, want %v", got, want)
}
}
func TestIsManagedDomainLegacyFallback(t *testing.T) {
t.Setenv("DYNADOT_MANAGED_DOMAINS", "")
t.Setenv("DYNADOT_DOMAIN", "legacy.io")
ResetManagedDomains()
if !IsManagedDomain("legacy.io") {
t.Errorf("IsManagedDomain(legacy.io) = false, want true")
}
if IsManagedDomain("openova.io") {
t.Errorf("legacy fallback should not include built-in defaults")
}
}
func TestIsManagedDomainBuiltInDefaults(t *testing.T) {
os.Unsetenv("DYNADOT_MANAGED_DOMAINS")
os.Unsetenv("DYNADOT_DOMAIN")
ResetManagedDomains()
for _, d := range []string{"openova.io", "omani.works"} {
if !IsManagedDomain(d) {
t.Errorf("built-in default missing %q", d)
}
}
}
func TestSplitDomainsList(t *testing.T) {
got := splitDomainsList("Foo.com, BAR.IO\tbaz.io ,foo.com")
want := []string{"foo.com", "bar.io", "baz.io"}
if strings.Join(got, ",") != strings.Join(want, ",") {
t.Errorf("splitDomainsList = %v, want %v", got, want)
}
}

View File

@ -0,0 +1,451 @@
// Package handler — HTTP surface for pool-domain-manager.
//
// Endpoints (all JSON; per the issue body):
//
// GET /api/v1/pool/{domain}/check?sub=X Fast read; PDM-DB only.
// POST /api/v1/pool/{domain}/reserve Atomic reserve; 10-min TTL.
// POST /api/v1/pool/{domain}/commit Promote → ACTIVE + Dynadot.
// DELETE /api/v1/pool/{domain}/release Free; remove Dynadot.
// GET /api/v1/pool/{domain}/list Operator-facing list.
// GET /api/v1/reserved Public reserved-name list.
// GET /healthz Liveness probe.
//
// Per docs/INVIOLABLE-PRINCIPLES.md #4 the handler does not hardcode domain
// names — every value comes from the URL path or request body, validated
// against the runtime DYNADOT_MANAGED_DOMAINS list.
package handler
import (
"encoding/json"
"errors"
"log/slog"
"net/http"
"strings"
"github.com/go-chi/chi/v5"
"github.com/openova-io/openova/core/pool-domain-manager/internal/allocator"
"github.com/openova-io/openova/core/pool-domain-manager/internal/dynadot"
"github.com/openova-io/openova/core/pool-domain-manager/internal/reserved"
"github.com/openova-io/openova/core/pool-domain-manager/internal/store"
)
// Handler holds the dependencies shared by every endpoint.
type Handler struct {
Alloc *allocator.Allocator
Store *store.Store // exposed for /healthz Ping
Log *slog.Logger
}
// New constructs a Handler.
func New(alloc *allocator.Allocator, s *store.Store, log *slog.Logger) *Handler {
return &Handler{Alloc: alloc, Store: s, Log: log}
}
// Routes returns the chi.Router with all PDM routes wired up.
func (h *Handler) Routes() *chi.Mux {
r := chi.NewRouter()
r.Get("/healthz", h.Healthz)
r.Route("/api/v1", func(r chi.Router) {
r.Get("/reserved", h.ListReserved)
r.Route("/pool/{domain}", func(r chi.Router) {
r.Get("/check", h.Check)
r.Get("/list", h.List)
r.Post("/reserve", h.Reserve)
r.Post("/commit", h.Commit)
r.Delete("/release", h.Release)
})
})
return r
}
// ── Healthz ────────────────────────────────────────────────────────────
// Healthz returns 200 if Postgres is reachable and the dynadot config is
// loaded; otherwise 503. The response includes the runtime managed-domain
// list so operators can grep for misconfiguration.
func (h *Handler) Healthz(w http.ResponseWriter, r *http.Request) {
if err := h.Store.Ping(r.Context()); err != nil {
writeJSON(w, http.StatusServiceUnavailable, map[string]any{
"status": "unhealthy",
"db": err.Error(),
})
return
}
writeJSON(w, http.StatusOK, map[string]any{
"status": "ok",
"managedDomains": dynadot.ManagedDomains(),
})
}
// ── Reserved-list ──────────────────────────────────────────────────────
// ListReserved exposes the canonical reserved-subdomain list. The wizard can
// consume this to render an inline hint instead of waiting for the user to
// type a reserved name and seeing an error.
func (h *Handler) ListReserved(w http.ResponseWriter, r *http.Request) {
writeJSON(w, http.StatusOK, map[string]any{
"reserved": reserved.All(),
})
}
// ── /pool/{domain}/check ───────────────────────────────────────────────
// Check is the read-only availability query. Always returns 200 OK with a
// JSON body — clients use the body's `available` field, not the HTTP
// status, to decide.
func (h *Handler) Check(w http.ResponseWriter, r *http.Request) {
domain := normaliseLabel(chi.URLParam(r, "domain"))
sub := normaliseLabel(r.URL.Query().Get("sub"))
if !isValidDNSLabel(sub) {
writeJSON(w, http.StatusOK, allocator.CheckResult{
Available: false,
Reason: "invalid-format",
Detail: "subdomain must be a-z, 0-9 and hyphens, start with a letter, max 63 characters",
})
return
}
res, err := h.Alloc.Check(r.Context(), domain, sub)
if err != nil {
h.Log.Error("check failed", "domain", domain, "sub", sub, "err", err)
writeJSON(w, http.StatusInternalServerError, map[string]string{"error": err.Error()})
return
}
writeJSON(w, http.StatusOK, res)
}
// ── /pool/{domain}/reserve ─────────────────────────────────────────────
// ReserveRequest is the body shape POSTed to /reserve.
type ReserveRequest struct {
Subdomain string `json:"subdomain"`
CreatedBy string `json:"createdBy,omitempty"`
}
// ReserveResponse is the wire shape returned to the caller.
type ReserveResponse struct {
PoolDomain string `json:"poolDomain"`
Subdomain string `json:"subdomain"`
State string `json:"state"`
ReservedAt string `json:"reservedAt"`
ExpiresAt string `json:"expiresAt"`
ReservationToken string `json:"reservationToken"`
CreatedBy string `json:"createdBy"`
}
// Reserve atomically reserves the (domain, subdomain) pair for the
// configured TTL. Returns 201 Created on success, 409 Conflict if the name
// is taken, 422 Unprocessable Entity on validation failure.
func (h *Handler) Reserve(w http.ResponseWriter, r *http.Request) {
domain := normaliseLabel(chi.URLParam(r, "domain"))
var req ReserveRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid JSON body"})
return
}
sub := normaliseLabel(req.Subdomain)
if !isValidDNSLabel(sub) {
writeJSON(w, http.StatusUnprocessableEntity, map[string]string{
"error": "invalid-format",
"detail": "subdomain must be a-z, 0-9 and hyphens, start with a letter, max 63 characters",
})
return
}
alloc, err := h.Alloc.Reserve(r.Context(), domain, sub, allocator.ReserveInput{CreatedBy: req.CreatedBy})
if err != nil {
switch {
case errors.Is(err, store.ErrConflict):
writeJSON(w, http.StatusConflict, map[string]string{
"error": "conflict",
"detail": "this subdomain is already reserved or active",
})
case errors.Is(err, dynadot.ErrUnmanagedDomain):
writeJSON(w, http.StatusUnprocessableEntity, map[string]string{
"error": "unsupported-pool",
"detail": "pool domain " + domain + " is not managed by OpenOva",
})
default:
h.Log.Error("reserve failed", "domain", domain, "sub", sub, "err", err)
writeJSON(w, http.StatusInternalServerError, map[string]string{"error": err.Error()})
}
return
}
resp := ReserveResponse{
PoolDomain: alloc.PoolDomain,
Subdomain: alloc.Subdomain,
State: string(alloc.State),
ReservedAt: alloc.ReservedAt.Format("2006-01-02T15:04:05Z07:00"),
ReservationToken: alloc.ReservationToken,
CreatedBy: alloc.CreatedBy,
}
if alloc.ExpiresAt != nil {
resp.ExpiresAt = alloc.ExpiresAt.Format("2006-01-02T15:04:05Z07:00")
}
writeJSON(w, http.StatusCreated, resp)
}
// ── /pool/{domain}/commit ──────────────────────────────────────────────
// CommitRequest carries the data needed to flip RESERVED → ACTIVE.
type CommitRequest struct {
Subdomain string `json:"subdomain"`
ReservationToken string `json:"reservationToken"`
SovereignFQDN string `json:"sovereignFQDN"`
LoadBalancerIP string `json:"loadBalancerIP"`
}
// Commit promotes a reservation to active and writes Dynadot records.
// Status codes:
//
// 200 OK — committed; row is active and DNS records exist
// 202 Accepted — committed in DB but Dynadot write failed (caller
// can retry Commit with same body; idempotent)
// 404 Not Found — no row exists for this (domain, subdomain)
// 409 Conflict — row is already active (re-commit attempt)
// 410 Gone — reservation TTL expired before commit; caller must
// Reserve again
// 403 Forbidden — reservation token mismatch
func (h *Handler) Commit(w http.ResponseWriter, r *http.Request) {
domain := normaliseLabel(chi.URLParam(r, "domain"))
var req CommitRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid JSON body"})
return
}
sub := normaliseLabel(req.Subdomain)
if !isValidDNSLabel(sub) {
writeJSON(w, http.StatusUnprocessableEntity, map[string]string{
"error": "invalid-format",
"detail": "subdomain must be a-z, 0-9 and hyphens, start with a letter, max 63 characters",
})
return
}
if strings.TrimSpace(req.LoadBalancerIP) == "" {
writeJSON(w, http.StatusUnprocessableEntity, map[string]string{
"error": "missing-lb-ip",
"detail": "loadBalancerIP is required for commit",
})
return
}
alloc, err := h.Alloc.Commit(r.Context(), domain, sub, allocator.CommitInput{
ReservationToken: req.ReservationToken,
SovereignFQDN: req.SovereignFQDN,
LoadBalancerIP: req.LoadBalancerIP,
})
if err != nil {
// Allocator returns a wrapped "dynadot write" error AFTER the row was
// flipped to active. Surface 202 in that case so the caller knows the
// row is committed but DNS is pending.
if alloc != nil && strings.Contains(err.Error(), "dynadot write") {
writeJSON(w, http.StatusAccepted, map[string]any{
"warning": "row committed but Dynadot write failed; retry commit to publish DNS",
"detail": err.Error(),
"poolDomain": alloc.PoolDomain,
"subdomain": alloc.Subdomain,
"state": string(alloc.State),
"sovereignFQDN": alloc.SovereignFQDN,
"loadBalancerIP": alloc.LoadBalancerIP,
})
return
}
switch {
case errors.Is(err, store.ErrNotFound):
writeJSON(w, http.StatusNotFound, map[string]string{
"error": "not-found",
"detail": "no reservation exists for this (poolDomain, subdomain) — call /reserve first",
})
case errors.Is(err, store.ErrConflict):
writeJSON(w, http.StatusConflict, map[string]string{
"error": "already-active",
"detail": "this allocation is already active",
})
case errors.Is(err, store.ErrTokenMismatch):
writeJSON(w, http.StatusForbidden, map[string]string{
"error": "token-mismatch",
"detail": "reservation token does not match the held reservation",
})
case errors.Is(err, store.ErrExpired):
writeJSON(w, http.StatusGone, map[string]string{
"error": "reservation-expired",
"detail": "the reservation TTL elapsed before commit; reserve again",
})
case errors.Is(err, dynadot.ErrUnmanagedDomain):
writeJSON(w, http.StatusUnprocessableEntity, map[string]string{
"error": "unsupported-pool",
"detail": "pool domain " + domain + " is not managed by OpenOva",
})
default:
h.Log.Error("commit failed", "domain", domain, "sub", sub, "err", err)
writeJSON(w, http.StatusInternalServerError, map[string]string{"error": err.Error()})
}
return
}
writeJSON(w, http.StatusOK, allocationResponse(alloc))
}
// ── /pool/{domain}/release ─────────────────────────────────────────────
// ReleaseRequest is the body shape DELETEd to /release. We accept a body
// rather than a query param so the wire shape matches reserve/commit.
type ReleaseRequest struct {
Subdomain string `json:"subdomain"`
}
// Release deletes the row and removes Dynadot records (when state was
// active). Returns 200 OK on success, 404 if no row, 422 on validation.
//
// We do NOT require the reservation token here — Release is operator-side
// (also invoked by catalyst-api on tofu destroy) and the catalyst-api may
// not still hold the original token by the time the destroy fires.
func (h *Handler) Release(w http.ResponseWriter, r *http.Request) {
domain := normaliseLabel(chi.URLParam(r, "domain"))
var req ReleaseRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
// Allow ?sub= query fallback so curl -X DELETE without a body works.
req.Subdomain = r.URL.Query().Get("sub")
}
sub := normaliseLabel(req.Subdomain)
if !isValidDNSLabel(sub) {
writeJSON(w, http.StatusUnprocessableEntity, map[string]string{
"error": "invalid-format",
"detail": "subdomain must be a-z, 0-9 and hyphens, start with a letter, max 63 characters",
})
return
}
alloc, err := h.Alloc.Release(r.Context(), domain, sub)
if err != nil {
// Partial: row deleted but Dynadot delete failed.
if alloc != nil && strings.Contains(err.Error(), "dynadot delete") {
writeJSON(w, http.StatusAccepted, map[string]any{
"warning": "row deleted but Dynadot delete failed; operator must clean up DNS manually",
"detail": err.Error(),
"freed": allocationResponse(alloc),
})
return
}
switch {
case errors.Is(err, store.ErrNotFound):
writeJSON(w, http.StatusNotFound, map[string]string{
"error": "not-found",
"detail": "no allocation exists for this (poolDomain, subdomain)",
})
case errors.Is(err, dynadot.ErrUnmanagedDomain):
writeJSON(w, http.StatusUnprocessableEntity, map[string]string{
"error": "unsupported-pool",
"detail": "pool domain " + domain + " is not managed by OpenOva",
})
default:
h.Log.Error("release failed", "domain", domain, "sub", sub, "err", err)
writeJSON(w, http.StatusInternalServerError, map[string]string{"error": err.Error()})
}
return
}
writeJSON(w, http.StatusOK, map[string]any{
"freed": allocationResponse(alloc),
})
}
// ── /pool/{domain}/list ────────────────────────────────────────────────
// List returns every allocation for the given pool domain. Operator-only;
// the manifest gates the path behind a Traefik auth middleware.
func (h *Handler) List(w http.ResponseWriter, r *http.Request) {
domain := normaliseLabel(chi.URLParam(r, "domain"))
allocs, err := h.Alloc.List(r.Context(), domain)
if err != nil {
if errors.Is(err, dynadot.ErrUnmanagedDomain) {
writeJSON(w, http.StatusUnprocessableEntity, map[string]string{
"error": "unsupported-pool",
"detail": "pool domain " + domain + " is not managed by OpenOva",
})
return
}
h.Log.Error("list failed", "domain", domain, "err", err)
writeJSON(w, http.StatusInternalServerError, map[string]string{"error": err.Error()})
return
}
out := make([]map[string]any, 0, len(allocs))
for i := range allocs {
out = append(out, allocationResponse(&allocs[i]))
}
writeJSON(w, http.StatusOK, map[string]any{
"poolDomain": domain,
"allocations": out,
})
}
// ── helpers ────────────────────────────────────────────────────────────
func writeJSON(w http.ResponseWriter, code int, v any) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(code)
_ = json.NewEncoder(w).Encode(v)
}
func normaliseLabel(s string) string {
return strings.ToLower(strings.TrimSpace(s))
}
// isValidDNSLabel validates an RFC 1035 label (lower-case-only, since we
// normalise upstream).
func isValidDNSLabel(s string) bool {
if s == "" || len(s) > 63 {
return false
}
// Allow domain labels with embedded dots ONLY for the {domain} URL
// param — those are validated separately via dynadot.IsManagedDomain.
// For subdomain inputs we require a single label.
for i, r := range s {
switch {
case r >= 'a' && r <= 'z':
continue
case r >= '0' && r <= '9':
if i == 0 {
return false
}
continue
case r == '-':
if i == 0 || i == len(s)-1 {
return false
}
continue
default:
return false
}
}
return true
}
func allocationResponse(a *store.Allocation) map[string]any {
out := map[string]any{
"poolDomain": a.PoolDomain,
"subdomain": a.Subdomain,
"state": string(a.State),
"reservedAt": a.ReservedAt.Format("2006-01-02T15:04:05Z07:00"),
"createdBy": a.CreatedBy,
}
if a.ExpiresAt != nil {
out["expiresAt"] = a.ExpiresAt.Format("2006-01-02T15:04:05Z07:00")
}
if a.SovereignFQDN != "" {
out["sovereignFQDN"] = a.SovereignFQDN
}
if a.LoadBalancerIP != "" {
out["loadBalancerIP"] = a.LoadBalancerIP
}
if a.ReservationToken != "" {
out["reservationToken"] = a.ReservationToken
}
return out
}

View File

@ -0,0 +1,77 @@
// Package reserved holds the canonical list of subdomain names that no tenant
// may claim under any OpenOva pool domain. It used to live in
// products/catalyst/bootstrap/api/internal/handler/subdomains.go as a private
// var — duplicated knowledge with no clear owner.
//
// Per docs/INVIOLABLE-PRINCIPLES.md #4 the list lives in ONE place: PDM.
// catalyst-api consults PDM via /check; the wizard consults catalyst-api via
// /api/v1/subdomains/check. There is no second copy of this list anywhere
// else in the fleet.
//
// The list is the union of:
// - control-plane prefixes the OpenTofu module will materialise as DNS
// records on every Sovereign (api, admin, console, gitea, harbor — see
// dynadot.AddSovereignRecords)
// - infrastructure prefixes that map to specific OpenOva services
// (openova, catalyst, openbao, vault, flux, k8s, system)
// - operational prefixes that look enough like a Sovereign to be
// dangerous if a tenant grabbed them (www, mail, smtp, imap, vpn, app,
// status, docs)
//
// The IsReserved() function is the only exported surface — callers don't
// see (and can't mutate) the underlying map.
package reserved
import "strings"
// reservedSubdomains — names we never let a tenant claim as their Sovereign
// root subdomain. Tenants get *.omantel.omani.works style records
// automatically; allowing a tenant to claim "console" would create
// "console.console.omani.works" which is meaningless and confusing.
var reservedSubdomains = map[string]struct{}{
"api": {},
"admin": {},
"console": {},
"gitea": {},
"harbor": {},
"keycloak": {},
"www": {},
"mail": {},
"smtp": {},
"imap": {},
"vpn": {},
"openova": {},
"catalyst": {},
"docs": {},
"status": {},
"app": {},
"system": {},
"openbao": {},
"vault": {},
"flux": {},
"k8s": {},
}
// IsReserved reports whether the given subdomain (lower-cased, trimmed) is
// in the reserved set. Caller is responsible for validating the input is a
// well-formed DNS label first; this function only checks set membership.
func IsReserved(subdomain string) bool {
_, ok := reservedSubdomains[strings.ToLower(strings.TrimSpace(subdomain))]
return ok
}
// All returns a sorted copy of the reserved list. Used by /api/v1/reserved
// for clients (e.g. the wizard) that want to render the list inline as a
// hint to the user.
func All() []string {
out := make([]string, 0, len(reservedSubdomains))
for k := range reservedSubdomains {
out = append(out, k)
}
for i := 1; i < len(out); i++ {
for j := i; j > 0 && out[j-1] > out[j]; j-- {
out[j-1], out[j] = out[j], out[j-1]
}
}
return out
}

View File

@ -0,0 +1,45 @@
package reserved
import "testing"
func TestIsReserved(t *testing.T) {
cases := map[string]bool{
"api": true,
"console": true,
"openbao": true,
"k8s": true,
"omantel": false,
"acme": false,
"": false,
"API": true, // case-insensitive
" api ": true, // trims whitespace
"foo-bar": false,
"my-corp": false,
}
for k, want := range cases {
if got := IsReserved(k); got != want {
t.Errorf("IsReserved(%q) = %v, want %v", k, got, want)
}
}
}
func TestAllSorted(t *testing.T) {
names := All()
if len(names) == 0 {
t.Fatal("expected non-empty reserved list")
}
for i := 1; i < len(names); i++ {
if names[i-1] >= names[i] {
t.Errorf("All() not sorted at index %d: %q >= %q", i, names[i-1], names[i])
}
}
}
func TestAllNoReachableNames(t *testing.T) {
// Sanity-check: every name in All() must be IsReserved.
for _, n := range All() {
if !IsReserved(n) {
t.Errorf("%q is in All() but IsReserved returned false", n)
}
}
}

View File

@ -0,0 +1,31 @@
-- pool-domain-manager schema, applied idempotently at process start.
-- Per docs/INVIOLABLE-PRINCIPLES.md #3 the database is CloudNativePG and
-- the schema lives in this single file (no Atlas / Goose / Flyway needed
-- for two tables); running the same SQL repeatedly is safe.
--
-- The CHECK constraint on state and the PRIMARY KEY on (pool_domain,
-- subdomain) together guarantee that no two callers can hold a name in
-- conflicting states. The expires_at index speeds up the sweeper.
CREATE TABLE IF NOT EXISTS pool_allocations (
pool_domain TEXT NOT NULL,
subdomain TEXT NOT NULL,
state TEXT NOT NULL CHECK (state IN ('reserved', 'active')),
reserved_at TIMESTAMPTZ NOT NULL,
expires_at TIMESTAMPTZ,
sovereign_fqdn TEXT,
load_balancer_ip TEXT,
reservation_token UUID,
created_by TEXT NOT NULL,
PRIMARY KEY (pool_domain, subdomain)
);
-- Sweeper-friendly partial index — only the (small) set of reserved rows
-- need to be scanned for TTL expiry.
CREATE INDEX IF NOT EXISTS pool_allocations_expires_idx
ON pool_allocations (expires_at)
WHERE state = 'reserved';
-- Operator-facing index — list all active rows for a pool fast.
CREATE INDEX IF NOT EXISTS pool_allocations_state_idx
ON pool_allocations (pool_domain, state);

View File

@ -0,0 +1,463 @@
// Package store — CloudNativePG / Postgres persistence for pool-domain-manager.
//
// The PDM owns a single table — pool_allocations — that holds the canonical
// allocation state for every (pool_domain, subdomain) pair the OpenOva fleet
// has ever reserved or activated. The table is intentionally simple: PDM is
// a small, stateless HTTP service backed by a single-writer Postgres database
// (CloudNativePG running in the openova-system namespace). Concurrency is
// resolved by Postgres row-level locks + UPSERT semantics rather than any
// application-level mutex.
//
// Schema (also encoded as a migration in migrations.sql):
//
// CREATE TABLE pool_allocations (
// pool_domain TEXT NOT NULL,
// subdomain TEXT NOT NULL,
// state TEXT NOT NULL CHECK (state IN ('reserved','active')),
// reserved_at TIMESTAMPTZ NOT NULL,
// expires_at TIMESTAMPTZ, -- NULL when state='active'
// sovereign_fqdn TEXT, -- set when state='active'
// load_balancer_ip TEXT, -- set when state='active'
// reservation_token UUID, -- set when state='reserved'
// created_by TEXT NOT NULL,
// PRIMARY KEY (pool_domain, subdomain)
// );
// CREATE INDEX pool_allocations_expires_idx ON pool_allocations (expires_at)
// WHERE state = 'reserved';
//
// Per docs/INVIOLABLE-PRINCIPLES.md #4 the connection string is read from the
// PDM_DATABASE_URL env var — never hardcoded. The K8s ExternalSecret pulls
// the credentials out of CNPG's auto-generated app secret and projects them
// here.
package store
import (
"context"
_ "embed"
"errors"
"fmt"
"time"
"github.com/google/uuid"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgconn"
"github.com/jackc/pgx/v5/pgxpool"
)
// State enumerates the three lifecycle states of a (pool, subdomain) row.
// NULL — implicit (no row) — is the fourth state, represented by absence.
type State string
const (
// StateReserved — the name is held with a TTL. expires_at is non-NULL.
// On TTL expiry the row is deleted by the sweeper goroutine.
StateReserved State = "reserved"
// StateActive — the name has been committed and Dynadot DNS records have
// been written. expires_at is NULL; the row stays until Release.
StateActive State = "active"
)
// ErrConflict — somebody else holds this (pool_domain, subdomain) — used by
// the allocator to map to HTTP 409 Conflict.
var ErrConflict = errors.New("pool allocation conflict — name already reserved or active")
// ErrNotFound — no row exists for the (pool_domain, subdomain) pair. The
// handlers map this to HTTP 404.
var ErrNotFound = errors.New("pool allocation not found")
// Allocation is the persistent shape of a row in pool_allocations.
type Allocation struct {
PoolDomain string `json:"poolDomain"`
Subdomain string `json:"subdomain"`
State State `json:"state"`
ReservedAt time.Time `json:"reservedAt"`
ExpiresAt *time.Time `json:"expiresAt,omitempty"`
SovereignFQDN string `json:"sovereignFQDN,omitempty"`
LoadBalancerIP string `json:"loadBalancerIP,omitempty"`
ReservationToken string `json:"reservationToken,omitempty"`
CreatedBy string `json:"createdBy"`
}
// Store wraps the pgxpool.Pool with the SQL operations PDM needs.
type Store struct {
pool *pgxpool.Pool
}
// New connects to Postgres using the DSN, applies migrations, and returns a
// ready-to-use Store. Caller is responsible for calling Close on shutdown.
func New(ctx context.Context, dsn string) (*Store, error) {
cfg, err := pgxpool.ParseConfig(dsn)
if err != nil {
return nil, fmt.Errorf("parse PDM_DATABASE_URL: %w", err)
}
// Modest pool — PDM handles low QPS (a wizard click rate of order 0.1/s
// fleet-wide) and we'd rather queue than monopolise CNPG connections.
cfg.MaxConns = 8
cfg.MinConns = 1
cfg.MaxConnLifetime = 30 * time.Minute
cfg.MaxConnIdleTime = 5 * time.Minute
pool, err := pgxpool.NewWithConfig(ctx, cfg)
if err != nil {
return nil, fmt.Errorf("connect postgres: %w", err)
}
if err := pool.Ping(ctx); err != nil {
pool.Close()
return nil, fmt.Errorf("ping postgres: %w", err)
}
s := &Store{pool: pool}
if err := s.migrate(ctx); err != nil {
pool.Close()
return nil, fmt.Errorf("apply migrations: %w", err)
}
return s, nil
}
// Close releases the underlying connection pool.
func (s *Store) Close() {
s.pool.Close()
}
// Ping verifies the database is reachable. Used by /healthz.
func (s *Store) Ping(ctx context.Context) error {
return s.pool.Ping(ctx)
}
//go:embed migrations.sql
var migrationsSQL string
// migrate applies the embedded migrations.sql idempotently. We deliberately
// avoid a full migration framework — the schema is tiny and idempotent SQL
// keeps PDM's startup path one TCP connect + one CREATE TABLE IF NOT EXISTS.
func (s *Store) migrate(ctx context.Context) error {
_, err := s.pool.Exec(ctx, migrationsSQL)
return err
}
// Reserve atomically inserts a row in state='reserved' with the given TTL.
// Returns ErrConflict if a row already exists for the (pool, subdomain) pair
// in any state — including an expired reservation that the sweeper has not
// yet collected (we treat sweeper lag conservatively and rely on the caller
// running ExpireReservations periodically; the SQL also prunes expired rows
// for the specific key it touches as part of the same transaction).
func (s *Store) Reserve(ctx context.Context, poolDomain, subdomain string, ttl time.Duration, createdBy string) (*Allocation, error) {
if ttl <= 0 {
return nil, fmt.Errorf("Reserve: ttl must be positive, got %s", ttl)
}
tx, err := s.pool.Begin(ctx)
if err != nil {
return nil, fmt.Errorf("begin tx: %w", err)
}
defer tx.Rollback(ctx) //nolint:errcheck
// Step 1: opportunistic prune — if a row exists for this exact key with
// state='reserved' and expires_at in the past, delete it. This keeps the
// reservation path responsive even if the background sweeper is slow.
if _, err := tx.Exec(ctx, `
DELETE FROM pool_allocations
WHERE pool_domain = $1
AND subdomain = $2
AND state = 'reserved'
AND expires_at < NOW()
`, poolDomain, subdomain); err != nil {
return nil, fmt.Errorf("prune expired: %w", err)
}
now := time.Now().UTC()
expires := now.Add(ttl)
token := uuid.New()
// Step 2: insert. If the row already exists (active OR not-yet-expired
// reservation) the unique constraint fires and we return ErrConflict.
_, err = tx.Exec(ctx, `
INSERT INTO pool_allocations
(pool_domain, subdomain, state, reserved_at, expires_at, reservation_token, created_by)
VALUES
($1, $2, 'reserved', $3, $4, $5, $6)
`, poolDomain, subdomain, now, expires, token, createdBy)
if err != nil {
var pgErr *pgconn.PgError
if errors.As(err, &pgErr) && pgErr.Code == "23505" /* unique_violation */ {
return nil, ErrConflict
}
return nil, fmt.Errorf("insert reservation: %w", err)
}
if err := tx.Commit(ctx); err != nil {
return nil, fmt.Errorf("commit reservation: %w", err)
}
exp := expires
return &Allocation{
PoolDomain: poolDomain,
Subdomain: subdomain,
State: StateReserved,
ReservedAt: now,
ExpiresAt: &exp,
ReservationToken: token.String(),
CreatedBy: createdBy,
}, nil
}
// Get returns the current allocation for the (pool, subdomain) pair, or
// ErrNotFound. Get does NOT prune expired reservations on read — callers
// who want fresh results should run after ExpireReservations or accept that
// /check may briefly return state='reserved' for a row that has just expired.
func (s *Store) Get(ctx context.Context, poolDomain, subdomain string) (*Allocation, error) {
row := s.pool.QueryRow(ctx, `
SELECT pool_domain, subdomain, state, reserved_at, expires_at,
sovereign_fqdn, load_balancer_ip, reservation_token, created_by
FROM pool_allocations
WHERE pool_domain = $1 AND subdomain = $2
`, poolDomain, subdomain)
var a Allocation
var (
expiresAt *time.Time
sovereignFQDN *string
loadBalancerIP *string
reservationToken *uuid.UUID
)
err := row.Scan(
&a.PoolDomain,
&a.Subdomain,
&a.State,
&a.ReservedAt,
&expiresAt,
&sovereignFQDN,
&loadBalancerIP,
&reservationToken,
&a.CreatedBy,
)
if errors.Is(err, pgx.ErrNoRows) {
return nil, ErrNotFound
}
if err != nil {
return nil, fmt.Errorf("scan allocation: %w", err)
}
a.ExpiresAt = expiresAt
if sovereignFQDN != nil {
a.SovereignFQDN = *sovereignFQDN
}
if loadBalancerIP != nil {
a.LoadBalancerIP = *loadBalancerIP
}
if reservationToken != nil {
a.ReservationToken = reservationToken.String()
}
return &a, nil
}
// IsAvailable reports whether the (pool, subdomain) pair is free for a fresh
// reservation. A row in state='reserved' that has expired is treated as free
// (we transparently prune it on next Reserve). All other rows = taken.
func (s *Store) IsAvailable(ctx context.Context, poolDomain, subdomain string) (bool, error) {
a, err := s.Get(ctx, poolDomain, subdomain)
if errors.Is(err, ErrNotFound) {
return true, nil
}
if err != nil {
return false, err
}
if a.State == StateReserved && a.ExpiresAt != nil && a.ExpiresAt.Before(time.Now().UTC()) {
return true, nil
}
return false, nil
}
// CommitInput is what the /commit endpoint provides.
type CommitInput struct {
ReservationToken string
SovereignFQDN string
LoadBalancerIP string
}
// Commit promotes an existing reservation to state='active'. Verifies the
// reservation_token matches (so a stale wizard tab can't commit somebody
// else's reservation) and that the reservation has not yet expired.
//
// Returns ErrNotFound if the row is gone, ErrConflict if it is already
// active, ErrTokenMismatch if the token doesn't match, and ErrExpired if the
// reservation TTL elapsed.
func (s *Store) Commit(ctx context.Context, poolDomain, subdomain string, in CommitInput) (*Allocation, error) {
tx, err := s.pool.Begin(ctx)
if err != nil {
return nil, fmt.Errorf("begin tx: %w", err)
}
defer tx.Rollback(ctx) //nolint:errcheck
// Lock the row for the duration of this transaction.
row := tx.QueryRow(ctx, `
SELECT state, expires_at, reservation_token
FROM pool_allocations
WHERE pool_domain = $1 AND subdomain = $2
FOR UPDATE
`, poolDomain, subdomain)
var (
state State
expiresAt *time.Time
reservationToken *uuid.UUID
)
if err := row.Scan(&state, &expiresAt, &reservationToken); err != nil {
if errors.Is(err, pgx.ErrNoRows) {
return nil, ErrNotFound
}
return nil, fmt.Errorf("lock row: %w", err)
}
if state == StateActive {
return nil, ErrConflict
}
if expiresAt != nil && expiresAt.Before(time.Now().UTC()) {
return nil, ErrExpired
}
wantToken, err := uuid.Parse(in.ReservationToken)
if err != nil {
return nil, ErrTokenMismatch
}
if reservationToken == nil || *reservationToken != wantToken {
return nil, ErrTokenMismatch
}
if _, err := tx.Exec(ctx, `
UPDATE pool_allocations
SET state = 'active',
expires_at = NULL,
reservation_token = NULL,
sovereign_fqdn = $3,
load_balancer_ip = $4
WHERE pool_domain = $1 AND subdomain = $2
`, poolDomain, subdomain, in.SovereignFQDN, in.LoadBalancerIP); err != nil {
return nil, fmt.Errorf("commit update: %w", err)
}
if err := tx.Commit(ctx); err != nil {
return nil, fmt.Errorf("commit tx: %w", err)
}
return s.Get(ctx, poolDomain, subdomain)
}
// Release deletes the row for (pool, subdomain) regardless of state. Returns
// the released row's previous state so the handler can decide whether to
// fire Dynadot delete calls (only state='active' rows have DNS records).
func (s *Store) Release(ctx context.Context, poolDomain, subdomain string) (*Allocation, error) {
row := s.pool.QueryRow(ctx, `
DELETE FROM pool_allocations
WHERE pool_domain = $1 AND subdomain = $2
RETURNING pool_domain, subdomain, state, reserved_at, expires_at,
sovereign_fqdn, load_balancer_ip, reservation_token, created_by
`, poolDomain, subdomain)
var a Allocation
var (
expiresAt *time.Time
sovereignFQDN *string
loadBalancerIP *string
reservationToken *uuid.UUID
)
err := row.Scan(
&a.PoolDomain,
&a.Subdomain,
&a.State,
&a.ReservedAt,
&expiresAt,
&sovereignFQDN,
&loadBalancerIP,
&reservationToken,
&a.CreatedBy,
)
if errors.Is(err, pgx.ErrNoRows) {
return nil, ErrNotFound
}
if err != nil {
return nil, fmt.Errorf("delete allocation: %w", err)
}
a.ExpiresAt = expiresAt
if sovereignFQDN != nil {
a.SovereignFQDN = *sovereignFQDN
}
if loadBalancerIP != nil {
a.LoadBalancerIP = *loadBalancerIP
}
if reservationToken != nil {
a.ReservationToken = reservationToken.String()
}
return &a, nil
}
// List returns every allocation for the given pool domain. Used by the
// operator-facing /list endpoint.
func (s *Store) List(ctx context.Context, poolDomain string) ([]Allocation, error) {
rows, err := s.pool.Query(ctx, `
SELECT pool_domain, subdomain, state, reserved_at, expires_at,
sovereign_fqdn, load_balancer_ip, reservation_token, created_by
FROM pool_allocations
WHERE pool_domain = $1
ORDER BY subdomain
`, poolDomain)
if err != nil {
return nil, fmt.Errorf("list allocations: %w", err)
}
defer rows.Close()
var out []Allocation
for rows.Next() {
var a Allocation
var (
expiresAt *time.Time
sovereignFQDN *string
loadBalancerIP *string
reservationToken *uuid.UUID
)
if err := rows.Scan(
&a.PoolDomain,
&a.Subdomain,
&a.State,
&a.ReservedAt,
&expiresAt,
&sovereignFQDN,
&loadBalancerIP,
&reservationToken,
&a.CreatedBy,
); err != nil {
return nil, fmt.Errorf("scan list row: %w", err)
}
a.ExpiresAt = expiresAt
if sovereignFQDN != nil {
a.SovereignFQDN = *sovereignFQDN
}
if loadBalancerIP != nil {
a.LoadBalancerIP = *loadBalancerIP
}
if reservationToken != nil {
a.ReservationToken = reservationToken.String()
}
out = append(out, a)
}
return out, rows.Err()
}
// ExpireReservations deletes every state='reserved' row whose expires_at is
// in the past. Returns the count of rows deleted. Called periodically by the
// sweeper goroutine.
func (s *Store) ExpireReservations(ctx context.Context) (int64, error) {
tag, err := s.pool.Exec(ctx, `
DELETE FROM pool_allocations
WHERE state = 'reserved'
AND expires_at < NOW()
`)
if err != nil {
return 0, fmt.Errorf("expire reservations: %w", err)
}
return tag.RowsAffected(), nil
}
// ErrTokenMismatch — the reservation_token in the request did not match
// the row's stored token. The caller likely is a stale tab.
var ErrTokenMismatch = errors.New("reservation token does not match")
// ErrExpired — the reservation TTL has elapsed; the caller must Reserve
// again before committing.
var ErrExpired = errors.New("reservation expired before commit")

View File

@ -0,0 +1,180 @@
package store
import (
"context"
"errors"
"os"
"testing"
"time"
"github.com/google/uuid"
)
// integrationDSN — set CI/local env to a writable Postgres for these tests.
// When unset, the integration tests skip; the rest of the package gets
// covered by allocator/handler tests with a thin in-memory shim. We
// deliberately don't pull testcontainers into the build path — Catalyst-
// Zero CI already runs Postgres as a service for other suites and the same
// container can host PDM's tests via PDM_TEST_DSN.
func integrationDSN(t *testing.T) string {
t.Helper()
dsn := os.Getenv("PDM_TEST_DSN")
if dsn == "" {
t.Skip("PDM_TEST_DSN not set — skipping integration test")
}
return dsn
}
func newTestStore(t *testing.T) *Store {
t.Helper()
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
s, err := New(ctx, integrationDSN(t))
if err != nil {
t.Fatalf("connect: %v", err)
}
t.Cleanup(func() {
// Truncate after each test so subsequent runs start clean.
_, _ = s.pool.Exec(context.Background(), `TRUNCATE pool_allocations`)
s.Close()
})
return s
}
func TestReserveHappyPath(t *testing.T) {
s := newTestStore(t)
ctx := context.Background()
a, err := s.Reserve(ctx, "omani.works", "tenant1", 10*time.Minute, "test")
if err != nil {
t.Fatalf("reserve: %v", err)
}
if a.State != StateReserved {
t.Errorf("state=%s want reserved", a.State)
}
if _, err := uuid.Parse(a.ReservationToken); err != nil {
t.Errorf("reservation token not a UUID: %v", err)
}
if a.ExpiresAt == nil || a.ExpiresAt.Before(time.Now().UTC()) {
t.Errorf("expiresAt must be in the future, got %v", a.ExpiresAt)
}
}
func TestReserveConflict(t *testing.T) {
s := newTestStore(t)
ctx := context.Background()
if _, err := s.Reserve(ctx, "omani.works", "tenant1", 10*time.Minute, "test"); err != nil {
t.Fatalf("first reserve: %v", err)
}
_, err := s.Reserve(ctx, "omani.works", "tenant1", 10*time.Minute, "test")
if !errors.Is(err, ErrConflict) {
t.Fatalf("second reserve: want ErrConflict, got %v", err)
}
}
func TestExpiryFreesName(t *testing.T) {
s := newTestStore(t)
ctx := context.Background()
if _, err := s.Reserve(ctx, "omani.works", "tenant1", 1*time.Millisecond, "test"); err != nil {
t.Fatalf("reserve: %v", err)
}
time.Sleep(50 * time.Millisecond)
// Reserve again — should succeed because the previous reservation has
// expired (the Reserve path prunes the expired row in the same tx).
a, err := s.Reserve(ctx, "omani.works", "tenant1", 10*time.Minute, "test")
if err != nil {
t.Fatalf("re-reserve after expiry: %v", err)
}
if a.State != StateReserved {
t.Errorf("state=%s want reserved", a.State)
}
}
func TestCommitFlipsState(t *testing.T) {
s := newTestStore(t)
ctx := context.Background()
r, err := s.Reserve(ctx, "omani.works", "tenant1", 10*time.Minute, "test")
if err != nil {
t.Fatalf("reserve: %v", err)
}
committed, err := s.Commit(ctx, "omani.works", "tenant1", CommitInput{
ReservationToken: r.ReservationToken,
SovereignFQDN: "tenant1.omani.works",
LoadBalancerIP: "1.2.3.4",
})
if err != nil {
t.Fatalf("commit: %v", err)
}
if committed.State != StateActive {
t.Errorf("state=%s want active", committed.State)
}
if committed.LoadBalancerIP != "1.2.3.4" {
t.Errorf("lbIP=%s", committed.LoadBalancerIP)
}
}
func TestCommitTokenMismatch(t *testing.T) {
s := newTestStore(t)
ctx := context.Background()
if _, err := s.Reserve(ctx, "omani.works", "tenant1", 10*time.Minute, "test"); err != nil {
t.Fatalf("reserve: %v", err)
}
_, err := s.Commit(ctx, "omani.works", "tenant1", CommitInput{
ReservationToken: uuid.NewString(),
SovereignFQDN: "tenant1.omani.works",
LoadBalancerIP: "1.2.3.4",
})
if !errors.Is(err, ErrTokenMismatch) {
t.Fatalf("commit: want ErrTokenMismatch, got %v", err)
}
}
func TestReleaseRemovesRow(t *testing.T) {
s := newTestStore(t)
ctx := context.Background()
r, err := s.Reserve(ctx, "omani.works", "tenant1", 10*time.Minute, "test")
if err != nil {
t.Fatalf("reserve: %v", err)
}
if _, err := s.Commit(ctx, "omani.works", "tenant1", CommitInput{
ReservationToken: r.ReservationToken,
SovereignFQDN: "tenant1.omani.works",
LoadBalancerIP: "1.2.3.4",
}); err != nil {
t.Fatalf("commit: %v", err)
}
freed, err := s.Release(ctx, "omani.works", "tenant1")
if err != nil {
t.Fatalf("release: %v", err)
}
if freed.State != StateActive {
t.Errorf("freed.State=%s want active (the previous state)", freed.State)
}
// Now the row is gone — Get must return ErrNotFound.
if _, err := s.Get(ctx, "omani.works", "tenant1"); !errors.Is(err, ErrNotFound) {
t.Errorf("after release: want ErrNotFound, got %v", err)
}
}
func TestExpireReservationsSweeper(t *testing.T) {
s := newTestStore(t)
ctx := context.Background()
if _, err := s.Reserve(ctx, "omani.works", "x", 1*time.Millisecond, "test"); err != nil {
t.Fatalf("reserve: %v", err)
}
time.Sleep(20 * time.Millisecond)
deleted, err := s.ExpireReservations(ctx)
if err != nil {
t.Fatalf("expire: %v", err)
}
if deleted != 1 {
t.Errorf("deleted=%d want 1", deleted)
}
}

View File

@ -0,0 +1,158 @@
# Composition: dynadot-pool-allocation.compose.openova.io — default
# realization for XDynadotPoolAllocation. Renders to a sequence of
# provider-http MR calls that mirror the wizard's imperative lifecycle:
#
# 1. POST http://pool-domain-manager.../api/v1/pool/{poolDomain}/reserve
# body: { subdomain, createdBy }
# → returns reservationToken
#
# 2. POST http://pool-domain-manager.../api/v1/pool/{poolDomain}/commit
# body: { subdomain, reservationToken, sovereignFQDN, loadBalancerIP }
#
# 3. (on delete) DELETE http://pool-domain-manager.../api/v1/pool/{poolDomain}/release
# body: { subdomain }
#
# Per docs/INVIOLABLE-PRINCIPLES.md principle #3 we use Crossplane's
# provider-http so the entire lifecycle is declarative — no bespoke Go,
# no exec.Command, no out-of-band shell scripts. The operator commits a
# DynadotPoolAllocation claim to Git and Flux + Crossplane converge.
#
# Provider-http reference:
# https://github.com/crossplane-contrib/provider-http
# We expect the cluster to have a ProviderConfig named 'pool-domain-manager'
# pointing at the in-cluster service URL — set up by Phase-0 cluster
# bootstrap (the Catalyst-Zero install ships this ProviderConfig as part
# of the platform's openova-system manifests).
apiVersion: apiextensions.crossplane.io/v1
kind: Composition
metadata:
name: dynadot-pool-allocation.compose.openova.io
labels:
catalyst.openova.io/component: crossplane
catalyst.openova.io/composition-family: pool-domain-manager
spec:
compositeTypeRef:
apiVersion: compose.openova.io/v1alpha1
kind: XDynadotPoolAllocation
writeConnectionSecretsToNamespace: crossplane-system
resources:
# ── 1. Reserve ────────────────────────────────────────────────────────
# Calls PDM /reserve. Provider-http stores the response body in the MR's
# status; the next step's request templating reads reservationToken
# from there.
- name: reserve
base:
apiVersion: http.crossplane.io/v1alpha2
kind: Request
spec:
forProvider:
url: "" # filled by patch
method: POST
headers:
Content-Type:
- application/json
payload:
baseUrl: "" # patched
body: "" # patched (subdomain + createdBy)
mappings:
- method: POST
action: CREATE
url: "" # patched
body: "" # patched
- method: DELETE
action: REMOVE
url: "" # patched (release endpoint)
body: "" # patched
providerConfigRef:
name: pool-domain-manager
patches:
# PDM URL: '<base>/api/v1/pool/<poolDomain>/reserve' on CREATE,
# '<base>/api/v1/pool/<poolDomain>/release' on DELETE. We default
# base to the in-cluster ClusterIP service so a stock Catalyst
# Sovereign bootstrap doesn't need any per-cluster overrides.
- fromFieldPath: spec.parameters.poolDomain
toFieldPath: spec.forProvider.mappings[0].url
transforms:
- type: string
string:
fmt: "http://pool-domain-manager.openova-system.svc.cluster.local:8080/api/v1/pool/%s/reserve"
- fromFieldPath: spec.parameters.poolDomain
toFieldPath: spec.forProvider.mappings[1].url
transforms:
- type: string
string:
fmt: "http://pool-domain-manager.openova-system.svc.cluster.local:8080/api/v1/pool/%s/release"
# Body for CREATE: { "subdomain": "<sub>", "createdBy": "crossplane" }
- fromFieldPath: spec.parameters.subdomain
toFieldPath: spec.forProvider.mappings[0].body
transforms:
- type: string
string:
fmt: '{"subdomain":"%s","createdBy":"crossplane"}'
- fromFieldPath: spec.parameters.subdomain
toFieldPath: spec.forProvider.mappings[1].body
transforms:
- type: string
string:
fmt: '{"subdomain":"%s"}'
# Surface the reservation token back onto the XR status so step 2
# can read it via fromConnectionSecret OR via direct status patch.
- type: ToCompositeFieldPath
fromFieldPath: status.response.body.reservationToken
toFieldPath: status.reservationToken
- type: ToCompositeFieldPath
fromFieldPath: status.response.body.state
toFieldPath: status.state
- type: ToCompositeFieldPath
fromFieldPath: status.response.body.expiresAt
toFieldPath: status.expiresAt
# ── 2. Commit ─────────────────────────────────────────────────────────
# Calls PDM /commit. Depends on the reservation token surfaced by
# step 1. provider-http evaluates resources in order, so by the time
# this MR runs the XR's status.reservationToken is populated.
- name: commit
base:
apiVersion: http.crossplane.io/v1alpha2
kind: Request
spec:
forProvider:
mappings:
- method: POST
action: CREATE
url: "" # patched
body: "" # patched
headers:
Content-Type:
- application/json
providerConfigRef:
name: pool-domain-manager
patches:
- fromFieldPath: spec.parameters.poolDomain
toFieldPath: spec.forProvider.mappings[0].url
transforms:
- type: string
string:
fmt: "http://pool-domain-manager.openova-system.svc.cluster.local:8080/api/v1/pool/%s/commit"
# Body composition: we need subdomain, reservationToken,
# sovereignFQDN, loadBalancerIP. Crossplane's standard patches
# don't support multi-source string interpolation in a single
# transform, so we use a CombineFromComposite block.
- type: CombineFromComposite
combine:
variables:
- fromFieldPath: spec.parameters.subdomain
- fromFieldPath: status.reservationToken
- fromFieldPath: spec.parameters.sovereignFQDN
- fromFieldPath: spec.parameters.loadBalancerIP
strategy: string
string:
fmt: '{"subdomain":"%s","reservationToken":"%s","sovereignFQDN":"%s","loadBalancerIP":"%s"}'
toFieldPath: spec.forProvider.mappings[0].body
# On commit success the row flips to active.
- type: ToCompositeFieldPath
fromFieldPath: status.response.body.state
toFieldPath: status.state

View File

@ -0,0 +1,128 @@
# XRD: XDynadotPoolAllocation — Crossplane wrapper over pool-domain-manager.
#
# The PDM HTTP API (introduced in #163) is the SOLE caller of api.dynadot.com
# in the OpenOva fleet. Most call paths go via catalyst-api during the
# wizard's lifecycle. This XR exists for the operator-driven path: an
# operator who wants to PRE-allocate a name (e.g. reserve `omantel.omani.works`
# for a Sovereign that hasn't been provisioned yet) commits a YAML to Git
# and Crossplane converges PDM to match.
#
# Per docs/BLUEPRINT-AUTHORING.md §8 the canonical XRD group is
# compose.openova.io/v1alpha1 — NOT catalyst.openova.io.
#
# Per docs/INVIOLABLE-PRINCIPLES.md principle #3 this Composition is the
# ONLY way operators declaratively manage pool allocations. The wizard's
# imperative path (catalyst-api → PDM HTTP) and this declarative path both
# write to the same PDM database — there is one source of truth.
#
# Per docs/INVIOLABLE-PRINCIPLES.md principle #4 every value
# (poolDomain, subdomain, sovereignFQDN, loadBalancerIP) is a schema field —
# no hardcoded names.
apiVersion: apiextensions.crossplane.io/v1
kind: CompositeResourceDefinition
metadata:
name: xdynadotpoolallocations.compose.openova.io
labels:
catalyst.openova.io/component: crossplane
catalyst.openova.io/composition-family: pool-domain-manager
spec:
group: compose.openova.io
names:
kind: XDynadotPoolAllocation
plural: xdynadotpoolallocations
claimNames:
kind: DynadotPoolAllocation
plural: dynadotpoolallocations
defaultCompositionRef:
name: dynadot-pool-allocation.compose.openova.io
versions:
- name: v1alpha1
served: true
referenceable: true
schema:
openAPIV3Schema:
type: object
required: [spec]
properties:
spec:
type: object
required: [parameters]
properties:
parameters:
type: object
required: [poolDomain, subdomain, sovereignFQDN, loadBalancerIP]
properties:
poolDomain:
type: string
description: |
OpenOva-managed pool domain (e.g. omani.works,
openova.io). Must appear in PDM's runtime
DYNADOT_MANAGED_DOMAINS env var or the Composition
will fail to reserve.
pattern: '^[a-z0-9][a-z0-9.-]+[a-z0-9]$'
subdomain:
type: string
description: |
Single RFC 1035 label (a-z, 0-9, hyphens). The
Composition reserves '<subdomain>.<poolDomain>' in
PDM and writes the canonical 6-record set to
Dynadot at commit time.
pattern: '^[a-z][a-z0-9-]{0,61}[a-z0-9]$'
sovereignFQDN:
type: string
description: |
Final FQDN the Sovereign answers on. Usually
'<subdomain>.<poolDomain>' but can differ for
operator-staged migrations.
loadBalancerIP:
type: string
description: |
IPv4 of the Hetzner Load Balancer that fronts the
Sovereign. Operator obtains this from the prior
XHetznerLoadBalancer status.loadBalancerIP.
pattern: '^([0-9]{1,3}\.){3}[0-9]{1,3}$'
createdBy:
type: string
description: |
Audit-trail string written into pool_allocations.
Defaults to 'crossplane' when omitted.
description: |
PDM allocation parameters. The full lifecycle (reserve →
commit) is encoded in the Composition; the operator only
needs to supply these four fields plus the optional
audit-trail attribution.
status:
type: object
properties:
state:
type: string
description: PDM-reported state (reserved | active).
enum: [reserved, active]
expiresAt:
type: string
format: date-time
description: TTL deadline when state=reserved.
reservationToken:
type: string
format: uuid
description: |
Returned by /reserve; consumed by /commit. Operators
don't typically read this — Crossplane chains the calls
automatically — but it is exposed for debugging.
additionalPrinterColumns:
- name: POOL
type: string
jsonPath: .spec.parameters.poolDomain
- name: SUBDOMAIN
type: string
jsonPath: .spec.parameters.subdomain
- name: STATE
type: string
jsonPath: .status.state
- name: LB-IP
type: string
jsonPath: .spec.parameters.loadBalancerIP
- name: AGE
type: date
jsonPath: .metadata.creationTimestamp

View File

@ -16,6 +16,7 @@ import (
"crypto/rand"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"net/http"
"sync"
@ -23,6 +24,7 @@ import (
"github.com/go-chi/chi/v5"
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/pdm"
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/provisioner"
)
@ -37,6 +39,17 @@ type Deployment struct {
FinishedAt time.Time
Events chan provisioner.Event
mu sync.Mutex
// PDM reservation captured before `tofu apply` for managed-pool
// deployments. The reservationToken is held until `tofu apply`
// returns the LB IP, at which point we POST it to PDM /commit. On
// `tofu destroy` (or a phase-0 retry that decides to abandon) we
// DELETE /release.
//
// Empty for BYO deployments — those keep their own DNS off-platform.
pdmReservationToken string
pdmPoolDomain string
pdmSubdomain string
}
// State returns a JSON-safe snapshot for the GET endpoint.
@ -92,6 +105,47 @@ func (h *Handler) CreateDeployment(w http.ResponseWriter, r *http.Request) {
StartedAt: time.Now(),
Events: make(chan provisioner.Event, 256),
}
// Reserve the pool subdomain via PDM BEFORE we kick off `tofu apply`.
// PDM holds the name with a TTL — if `tofu apply` fails or this catalyst-
// api Pod crashes, the TTL expires and the name is freed automatically.
// On the success path the runProvisioning goroutine calls /commit with
// the LB IP, which flips the reservation to ACTIVE and writes the
// Dynadot DNS records.
//
// For BYO deployments (the customer owns the DNS zone) we skip PDM
// entirely — the customer points their own CNAME at the LB IP shown
// on the success screen.
if req.SovereignDomainMode == "pool" && pdm.IsManagedDomain(req.SovereignPoolDomain) {
if h.pdm == nil {
writeJSON(w, http.StatusInternalServerError, map[string]string{
"error": "pool-domain-manager client is not configured (POOL_DOMAIN_MANAGER_URL)",
})
return
}
reserveCtx, reserveCancel := context.WithTimeout(r.Context(), 10*time.Second)
reservation, reserveErr := h.pdm.Reserve(reserveCtx, req.SovereignPoolDomain, req.SovereignSubdomain, "catalyst-api/deployment-"+id)
reserveCancel()
if reserveErr != nil {
if errors.Is(reserveErr, pdm.ErrConflict) {
writeJSON(w, http.StatusConflict, map[string]string{
"error": "subdomain-conflict",
"detail": "this subdomain has been reserved or activated for the chosen pool — pick a different name",
})
return
}
h.log.Error("pdm reserve failed", "id", id, "err", reserveErr)
writeJSON(w, http.StatusServiceUnavailable, map[string]string{
"error": "pdm-unavailable",
"detail": "pool-domain-manager is temporarily unreachable: " + reserveErr.Error(),
})
return
}
dep.pdmReservationToken = reservation.ReservationToken
dep.pdmPoolDomain = reservation.PoolDomain
dep.pdmSubdomain = reservation.Subdomain
}
h.deployments.Store(id, dep)
// Capture status before launching the goroutine — runProvisioning races
@ -181,6 +235,48 @@ func (h *Handler) runProvisioning(dep *Deployment) {
)
}
dep.mu.Unlock()
// PDM lifecycle: on success, /commit with the LB IP; on failure, /release
// so the reservation TTL doesn't have to expire to free the name. PDM is
// the single owner of the Dynadot side-effect (it is also responsible for
// AddSovereignRecords on commit; catalyst-api never writes DNS itself).
if dep.pdmReservationToken != "" && h.pdm != nil {
pdmCtx, pdmCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer pdmCancel()
if err == nil && result != nil {
commitErr := h.pdm.Commit(pdmCtx, dep.pdmPoolDomain, pdm.CommitInput{
Subdomain: dep.pdmSubdomain,
ReservationToken: dep.pdmReservationToken,
SovereignFQDN: result.SovereignFQDN,
LoadBalancerIP: result.LoadBalancerIP,
})
if commitErr != nil {
h.log.Error("pdm commit failed; sovereign is live but DNS records may be stale",
"id", dep.ID,
"poolDomain", dep.pdmPoolDomain,
"subdomain", dep.pdmSubdomain,
"err", commitErr,
)
} else {
h.log.Info("pdm commit complete",
"id", dep.ID,
"poolDomain", dep.pdmPoolDomain,
"subdomain", dep.pdmSubdomain,
"loadBalancerIP", result.LoadBalancerIP,
)
}
} else {
releaseErr := h.pdm.Release(pdmCtx, dep.pdmPoolDomain, dep.pdmSubdomain)
if releaseErr != nil && !errors.Is(releaseErr, pdm.ErrNotFound) {
h.log.Error("pdm release failed; reservation will expire on TTL",
"id", dep.ID,
"poolDomain", dep.pdmPoolDomain,
"subdomain", dep.pdmSubdomain,
"err", releaseErr,
)
}
}
}
}
func newID() string {

View File

@ -0,0 +1,122 @@
// Tests for the catalyst-api → PDM lifecycle: reserve before tofu apply,
// commit on success, release on failure. These cover the deployment-level
// path #163 introduced — the wizard creates a deployment, PDM holds the
// reservation while tofu runs, and PDM owns the eventual DNS write.
package handler
import (
"bytes"
"context"
"encoding/json"
"log/slog"
"net/http"
"net/http/httptest"
"testing"
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/pdm"
)
func TestCreateDeployment_ManagedPoolReservesViaPDM(t *testing.T) {
t.Setenv("DYNADOT_MANAGED_DOMAINS", "omani.works")
pdm.ResetManagedDomains()
fake := &fakePDM{}
h := NewWithPDM(slog.Default(), fake)
body, _ := json.Marshal(map[string]any{
"sovereignFQDN": "omantel.omani.works",
"sovereignDomainMode": "pool",
"sovereignPoolDomain": "omani.works",
"sovereignSubdomain": "omantel",
"hetznerToken": "tok",
"hetznerProjectID": "proj",
"region": "fsn1",
"orgName": "Omantel",
"orgEmail": "ops@omantel.om",
"sshPublicKey": "ssh-ed25519 AAAA test",
})
w := httptest.NewRecorder()
r := httptest.NewRequest(http.MethodPost, "/api/v1/deployments", bytes.NewReader(body))
h.CreateDeployment(w, r)
// 201 — deployment row created. The runProvisioning goroutine is
// launched in a background goroutine; in this unit test the goroutine
// will fail at tofu exec (not installed) but for this test we only
// care that CreateDeployment reserved before launching it.
if w.Code != http.StatusCreated {
t.Fatalf("status=%d body=%s", w.Code, w.Body.String())
}
if len(fake.reserves) != 1 {
t.Fatalf("expected 1 PDM reserve, got %d", len(fake.reserves))
}
if fake.reserves[0].pool != "omani.works" || fake.reserves[0].sub != "omantel" {
t.Errorf("reserve called with wrong args: %+v", fake.reserves[0])
}
}
func TestCreateDeployment_PDMConflictBlocksDeployment(t *testing.T) {
t.Setenv("DYNADOT_MANAGED_DOMAINS", "omani.works")
pdm.ResetManagedDomains()
fake := &fakePDM{
reserve: func(ctx context.Context, pool, sub, by string) (*pdm.Reservation, error) {
return nil, pdm.ErrConflict
},
}
h := NewWithPDM(slog.Default(), fake)
body, _ := json.Marshal(map[string]any{
"sovereignFQDN": "omantel.omani.works",
"sovereignDomainMode": "pool",
"sovereignPoolDomain": "omani.works",
"sovereignSubdomain": "omantel",
"hetznerToken": "tok",
"hetznerProjectID": "proj",
"region": "fsn1",
"orgName": "Omantel",
"orgEmail": "ops@omantel.om",
"sshPublicKey": "ssh-ed25519 AAAA test",
})
w := httptest.NewRecorder()
r := httptest.NewRequest(http.MethodPost, "/api/v1/deployments", bytes.NewReader(body))
h.CreateDeployment(w, r)
if w.Code != http.StatusConflict {
t.Fatalf("status=%d want 409 (subdomain-conflict), body=%s", w.Code, w.Body.String())
}
}
func TestCreateDeployment_BYODoesNotReserve(t *testing.T) {
t.Setenv("DYNADOT_MANAGED_DOMAINS", "omani.works")
pdm.ResetManagedDomains()
fake := &fakePDM{}
h := NewWithPDM(slog.Default(), fake)
body, _ := json.Marshal(map[string]any{
"sovereignFQDN": "k8s.acme.io",
"sovereignDomainMode": "byo",
"sovereignPoolDomain": "acme.io",
"sovereignSubdomain": "k8s",
"hetznerToken": "tok",
"hetznerProjectID": "proj",
"region": "fsn1",
"orgName": "Acme",
"orgEmail": "ops@acme.io",
"sshPublicKey": "ssh-ed25519 AAAA test",
})
w := httptest.NewRecorder()
r := httptest.NewRequest(http.MethodPost, "/api/v1/deployments", bytes.NewReader(body))
h.CreateDeployment(w, r)
if w.Code != http.StatusCreated {
t.Fatalf("status=%d body=%s", w.Code, w.Body.String())
}
// BYO must NOT consult PDM — the customer owns DNS.
if len(fake.reserves) != 0 {
t.Errorf("BYO reserved via PDM unexpectedly: %+v", fake.reserves)
}
}

View File

@ -1,3 +1,4 @@
// Package handler holds shared state for all HTTP handlers.
package handler
import (
@ -6,28 +7,62 @@ import (
"net/http"
"os"
"sync"
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/pdm"
)
// Handler holds shared state for all HTTP handlers.
//
// dynadotAPIKey + dynadotAPISecret are read from environment variables that
// are mounted from the dynadot-api-credentials K8s secret in the
// openova-system namespace via ESO at deploy time. They are injected into
// pool-domain ProvisionRequests so the provisioner can write DNS records
// for *.{subdomain}.{pool-domain}.
// dynadotAPIKey + dynadotAPISecret remain on the Handler so the OpenTofu
// module's `dynadot_*` variables can still receive credentials for the
// Phase-0 DNS bootstrap that runs at first `tofu apply` time. After #163
// Phase 4 lands the Crossplane Composition that wraps PDM as a declarative
// MR, even those fields go away (PDM holds the credentials; catalyst-api
// merely calls PDM via the in-cluster service FQDN).
//
// pdm is the central authority for OpenOva-pool subdomain allocation
// (introduced by #163). catalyst-api never calls api.dynadot.com directly
// for the availability check / reservation lifecycle after this lands —
// every interaction with the Dynadot zone flows through PDM.
type Handler struct {
log *slog.Logger
deployments sync.Map // map[string]*Deployment
dynadotAPIKey string
dynadotAPISecret string
// pdm — pool-domain-manager client. Required in production; tests can
// inject a fake via NewWithPDM. The default URL points at the in-cluster
// service FQDN so a stock Catalyst-Zero deployment "just works" without
// per-pod configuration.
pdm pdmClient
}
// New creates a Handler.
// New creates a Handler with the runtime configuration loaded from env.
//
// POOL_DOMAIN_MANAGER_URL — defaults to the in-cluster service FQDN. Per
// docs/INVIOLABLE-PRINCIPLES.md #4 the URL is configuration, not code; an
// air-gapped install can override it to point at the operator's own
// PDM endpoint.
func New(log *slog.Logger) *Handler {
pdmURL := os.Getenv("POOL_DOMAIN_MANAGER_URL")
if pdmURL == "" {
pdmURL = "http://pool-domain-manager.openova-system.svc.cluster.local:8080"
}
return &Handler{
log: log,
dynadotAPIKey: os.Getenv("DYNADOT_API_KEY"),
dynadotAPISecret: os.Getenv("DYNADOT_API_SECRET"),
pdm: pdm.New(pdmURL),
}
}
// NewWithPDM is exposed for tests; production code uses New.
func NewWithPDM(log *slog.Logger, client pdmClient) *Handler {
return &Handler{
log: log,
dynadotAPIKey: os.Getenv("DYNADOT_API_KEY"),
dynadotAPISecret: os.Getenv("DYNADOT_API_SECRET"),
pdm: client,
}
}

View File

@ -1,39 +1,34 @@
// Package handler — subdomains.go: pre-submit availability check.
//
// Closes #124 ([I] ux: error handling — what happens if subdomain already
// taken). The wizard's StepOrg debounces keystrokes and POSTs the
// candidate subdomain + pool-domain pair here BEFORE the user clicks
// Next, so the validator catches collisions early instead of failing
// at provisioning time when Dynadot rejects the duplicate record.
// Closes the DNS-wildcard regression in #163 by routing every check for an
// OpenOva-managed pool domain through pool-domain-manager (PDM). PDM is the
// authoritative allocation source — it does not consult DNS at all, so the
// Dynadot wildcard parking record at the apex of omani.works (which made
// EVERY subdomain resolve to 185.53.179.128 and broke the previous
// LookupHost-based check) is now architecturally irrelevant for managed
// pools.
//
// How "taken" is determined:
// Decision tree per request:
//
// 1. Pool-domain check — only OpenOva-managed pool domains are
// candidates for this endpoint; BYO domains are the customer's
// responsibility. We reject any pool the wizard doesn't recognise
// (defence-in-depth — the wizard already filters its own dropdown,
// but the handler must validate independently).
// 1. Validate the subdomain as an RFC 1035 label (cheap, local).
// 2. If poolDomain is in the runtime DYNADOT_MANAGED_DOMAINS list →
// delegate to PDM via Client.Check. PDM owns the reserved-name list
// and the allocation table; we just surface its response verbatim.
// 3. Otherwise the caller is asking about a BYO domain (a customer's own
// DNS zone) — fall back to a DNS-based check via net.LookupHost. PDM
// doesn't manage BYO zones; the customer's nameserver IS the source
// of truth there.
//
// 2. Reserved-name check — short list of RFC 1035 / OpenOva
// Sovereign-control-plane subdomains we never let a tenant claim
// (api, admin, console, gitea, harbor, www, mail). Tenants get
// *those* names automatically as part of the Sovereign FQDN
// structure once they pick their root subdomain.
// Per docs/INVIOLABLE-PRINCIPLES.md #4: PDM's URL is read from the
// POOL_DOMAIN_MANAGER_URL env var (default = in-cluster service FQDN). The
// reserved-name list lives ONLY in PDM after this commit — catalyst-api no
// longer maintains a copy.
//
// 3. DNS resolution — net.DefaultResolver.LookupHost on
// "<subdomain>.<pool>" with a 2-second timeout. If anything
// resolves, the name is considered taken (whether it's an A,
// AAAA, or CNAME, the global DNS already knows about it).
//
// Per docs/INVIOLABLE-PRINCIPLES.md #4 ("never hardcode") the pool
// list is shared with the package-level IsManagedDomain check in
// internal/dynadot/. The reserved-name list is centralised here.
//
// Per the auto-memory `feedback_dynadot_dns.md`: NEVER run exploratory
// set_dns2 calls. We deliberately do NOT call Dynadot's API for the
// availability check — Dynadot's API is write-only-safe. The global
// DNS resolver is the eventually-consistent source of truth for what
// names already point somewhere.
// Per Lesson #24 in docs/INVIOLABLE-PRINCIPLES.md: this is a STRUCTURAL fix,
// not a bandaid. The previous DNS-based path is REMOVED for managed pools,
// not augmented. The only remaining net.LookupHost call lives in the BYO
// branch — and it is the right tool there because BYO zones are owned by
// the customer, not by OpenOva.
package handler
import (
@ -45,37 +40,9 @@ import (
"strings"
"time"
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/dynadot"
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/pdm"
)
// reservedSubdomains — names we never let a tenant claim as their
// Sovereign root subdomain. Tenants get *.omantel.omani.works style
// records automatically; the wizard prevents claiming any name that
// would collide with the canonical control-plane sub-records.
var reservedSubdomains = map[string]struct{}{
"api": {},
"admin": {},
"console": {},
"gitea": {},
"harbor": {},
"keycloak": {},
"www": {},
"mail": {},
"smtp": {},
"imap": {},
"vpn": {},
"openova": {},
"catalyst": {},
"docs": {},
"status": {},
"app": {},
"system": {},
"openbao": {},
"vault": {},
"flux": {},
"k8s": {},
}
type subdomainCheckRequest struct {
Subdomain string `json:"subdomain"`
// PoolDomain — the apex pool domain (e.g. "omani.works"), NOT the
@ -88,14 +55,19 @@ type subdomainCheckRequest struct {
//
// available=true → subdomain is free, user can submit.
// available=false → subdomain is taken; reason explains why.
// (no error field) → backend reached resolver / pool list cleanly.
//
// reason values:
// reason values (managed pools mirror PDM verbatim, BYO uses local strings):
// "invalid-format" subdomain is not a valid RFC 1035 label
// "unsupported-pool" poolDomain is not an OpenOva-managed pool
// "reserved" subdomain is in reservedSubdomains
// "exists" DNS resolver returned at least one record
// "lookup-error" DNS lookup itself failed (transient — user retries)
// "unsupported-pool" poolDomain is not an OpenOva-managed pool (PDM)
// — only surfaced for the BYO path's sanity check;
// managed-pool requests delegate to PDM which owns
// this verdict.
// "reserved" subdomain is in PDM's reserved list (managed)
// "reserved-state" PDM holds a non-expired reservation (managed)
// "active-state" PDM has an active allocation (managed)
// "exists" BYO DNS resolver returned at least one record
// "lookup-error" BYO DNS lookup itself failed (transient)
// "pdm-unavailable" PDM call failed — wizard treats as transient
type SubdomainCheckResponse struct {
Available bool `json:"available"`
Reason string `json:"reason,omitempty"`
@ -137,36 +109,71 @@ func (h *Handler) CheckSubdomain(w http.ResponseWriter, r *http.Request) {
return
}
if !dynadot.IsManagedDomain(pool) {
// Managed pools — PDM is the authoritative source of truth.
if pdm.IsManagedDomain(pool) {
h.checkManagedPool(w, r.Context(), pool, sub)
return
}
// BYO domain — fall back to the legacy DNS-based check. The customer
// owns the zone; resolving the name is the only signal we have.
h.checkBYO(w, r.Context(), pool, sub)
}
// checkManagedPool delegates to PDM. We surface PDM's response verbatim
// (available, reason, detail, fqdn) so the wizard can render PDM's
// authoritative messages without an extra mapping layer.
func (h *Handler) checkManagedPool(w http.ResponseWriter, ctx context.Context, pool, sub string) {
if h.pdm == nil {
// Defence-in-depth: if the deployment forgot POOL_DOMAIN_MANAGER_URL,
// surface a transient error rather than silently falling back to DNS
// (which would resurrect the wildcard-parking bug this file exists
// to fix).
writeJSON(w, http.StatusOK, SubdomainCheckResponse{
Available: false,
Reason: "unsupported-pool",
Detail: "pool domain " + pool + " is not managed by OpenOva — pick a different pool or use BYO",
Reason: "pdm-unavailable",
Detail: "pool-domain-manager client is not configured — operator must set POOL_DOMAIN_MANAGER_URL",
FQDN: sub + "." + pool,
})
return
}
if _, taken := reservedSubdomains[sub]; taken {
pdmCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
res, err := h.pdm.Check(pdmCtx, pool, sub)
if err != nil {
h.log.Error("pdm check failed", "pool", pool, "sub", sub, "err", err)
writeJSON(w, http.StatusOK, SubdomainCheckResponse{
Available: false,
Reason: "reserved",
Detail: "this subdomain is reserved for the Sovereign control plane — pick a different name",
Reason: "pdm-unavailable",
Detail: "pool-domain-manager is temporarily unreachable — try again",
FQDN: sub + "." + pool,
})
return
}
writeJSON(w, http.StatusOK, SubdomainCheckResponse{
Available: res.Available,
Reason: res.Reason,
Detail: res.Detail,
FQDN: res.FQDN,
})
}
// checkBYO performs the DNS-based availability check for customer-owned
// (Bring-Your-Own) domains. PDM doesn't manage BYO zones — the customer's
// nameserver is the source of truth — so net.LookupHost is the right
// primitive here.
func (h *Handler) checkBYO(w http.ResponseWriter, ctx context.Context, pool, sub string) {
fqdn := sub + "." + pool
// Two-second timeout — long enough for global DNS but short enough
// that the wizard's debounced keystroke loop stays responsive.
ctx, cancel := context.WithTimeout(r.Context(), 2*time.Second)
dnsCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
defer cancel()
addrs, err := net.DefaultResolver.LookupHost(ctx, fqdn)
addrs, err := net.DefaultResolver.LookupHost(dnsCtx, fqdn)
if err != nil {
// NXDOMAIN is "not taken" — the most common, success case. Any
// other error class (timeout, server-fail) is a transient lookup
// problem the wizard surfaces but doesn't treat as taken.
if isNXDomain(err) {
writeJSON(w, http.StatusOK, SubdomainCheckResponse{
Available: true,
@ -182,7 +189,6 @@ func (h *Handler) CheckSubdomain(w http.ResponseWriter, r *http.Request) {
})
return
}
if len(addrs) == 0 {
writeJSON(w, http.StatusOK, SubdomainCheckResponse{
Available: true,
@ -190,7 +196,6 @@ func (h *Handler) CheckSubdomain(w http.ResponseWriter, r *http.Request) {
})
return
}
writeJSON(w, http.StatusOK, SubdomainCheckResponse{
Available: false,
Reason: "exists",
@ -235,3 +240,12 @@ func isNXDomain(err error) bool {
}
return false
}
// pdmClient is implemented by *pdm.Client. The interface lets us pass a
// fake in tests without wiring a real HTTP server.
type pdmClient interface {
Check(ctx context.Context, poolDomain, subdomain string) (*pdm.CheckResult, error)
Reserve(ctx context.Context, poolDomain, subdomain, createdBy string) (*pdm.Reservation, error)
Commit(ctx context.Context, poolDomain string, in pdm.CommitInput) error
Release(ctx context.Context, poolDomain, subdomain string) error
}

View File

@ -0,0 +1,260 @@
// Tests for subdomains.go — the catalyst-api side of the PDM contract.
// These cover three architectural invariants:
//
// 1. Managed pools NEVER call net.LookupHost. The DNS-wildcard parking
// record at omani.works (which previously made every subdomain
// resolve to 185.53.179.128) cannot cause a false positive when
// the pool is in DYNADOT_MANAGED_DOMAINS — PDM is the single source
// of truth.
// 2. BYO domains use net.LookupHost — the customer's nameserver is
// authoritative; PDM doesn't manage their zone.
// 3. The PDM client is consulted exactly once per managed-pool request,
// with the response surfaced verbatim.
//
// These guarantees are what prevent the regression #163 was opened for.
package handler
import (
"context"
"encoding/json"
"errors"
"io"
"log/slog"
"net/http"
"net/http/httptest"
"strings"
"testing"
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/pdm"
)
// fakePDM is a stub pdmClient that records every call. We assert against the
// recorded calls to prove the behaviour the architecture requires.
type fakePDM struct {
checks []checkCall
check func(ctx context.Context, pool, sub string) (*pdm.CheckResult, error)
reserves []reserveCall
reserve func(ctx context.Context, pool, sub, by string) (*pdm.Reservation, error)
commits []pdm.CommitInput
commit func(ctx context.Context, pool string, in pdm.CommitInput) error
releases []releaseCall
release func(ctx context.Context, pool, sub string) error
}
type checkCall struct{ pool, sub string }
type reserveCall struct{ pool, sub, by string }
type releaseCall struct{ pool, sub string }
func (f *fakePDM) Check(ctx context.Context, pool, sub string) (*pdm.CheckResult, error) {
f.checks = append(f.checks, checkCall{pool, sub})
if f.check != nil {
return f.check(ctx, pool, sub)
}
return &pdm.CheckResult{Available: true, FQDN: sub + "." + pool}, nil
}
func (f *fakePDM) Reserve(ctx context.Context, pool, sub, by string) (*pdm.Reservation, error) {
f.reserves = append(f.reserves, reserveCall{pool, sub, by})
if f.reserve != nil {
return f.reserve(ctx, pool, sub, by)
}
return &pdm.Reservation{
PoolDomain: pool, Subdomain: sub, State: "reserved",
ReservationToken: "00000000-0000-0000-0000-000000000000",
}, nil
}
func (f *fakePDM) Commit(ctx context.Context, pool string, in pdm.CommitInput) error {
f.commits = append(f.commits, in)
if f.commit != nil {
return f.commit(ctx, pool, in)
}
return nil
}
func (f *fakePDM) Release(ctx context.Context, pool, sub string) error {
f.releases = append(f.releases, releaseCall{pool, sub})
if f.release != nil {
return f.release(ctx, pool, sub)
}
return nil
}
func decodeResp(t *testing.T, body io.Reader) SubdomainCheckResponse {
t.Helper()
var got SubdomainCheckResponse
raw, _ := io.ReadAll(body)
if err := json.Unmarshal(raw, &got); err != nil {
t.Fatalf("decode response: %v (body=%s)", err, string(raw))
}
return got
}
func makeRequest(body string) *http.Request {
r := httptest.NewRequest(http.MethodPost, "/api/v1/subdomains/check", strings.NewReader(body))
r.Header.Set("Content-Type", "application/json")
return r
}
func TestCheckSubdomain_ManagedPoolDelegatesToPDM(t *testing.T) {
t.Setenv("DYNADOT_MANAGED_DOMAINS", "omani.works,openova.io")
pdm.ResetManagedDomains()
fake := &fakePDM{
check: func(ctx context.Context, pool, sub string) (*pdm.CheckResult, error) {
return &pdm.CheckResult{Available: true, FQDN: sub + "." + pool}, nil
},
}
h := NewWithPDM(slog.Default(), fake)
w := httptest.NewRecorder()
h.CheckSubdomain(w, makeRequest(`{"subdomain":"dadasg4543sdfs","poolDomain":"omani.works"}`))
if w.Code != http.StatusOK {
t.Fatalf("status=%d body=%s", w.Code, w.Body.String())
}
got := decodeResp(t, w.Body)
if !got.Available {
t.Errorf("Available=false body=%+v", got)
}
if len(fake.checks) != 1 || fake.checks[0].pool != "omani.works" || fake.checks[0].sub != "dadasg4543sdfs" {
t.Errorf("expected single PDM check call, got %+v", fake.checks)
}
}
// The architectural invariant: even if the customer happens to type a
// random string that "would" resolve via the omani.works wildcard, PDM
// (which has no DNS dependency) returns Available=true — i.e. the
// wildcard parking record is NEVER consulted on the managed-pool path.
func TestCheckSubdomain_WildcardParkingIsIgnored(t *testing.T) {
t.Setenv("DYNADOT_MANAGED_DOMAINS", "omani.works")
pdm.ResetManagedDomains()
fake := &fakePDM{
check: func(ctx context.Context, pool, sub string) (*pdm.CheckResult, error) {
// PDM has nothing in its allocation table — it returns
// Available=true regardless of what DNS says.
return &pdm.CheckResult{Available: true, FQDN: sub + "." + pool}, nil
},
}
h := NewWithPDM(slog.Default(), fake)
for _, sub := range []string{"foo", "dadasg4543sdfs", "totally-random-name"} {
w := httptest.NewRecorder()
h.CheckSubdomain(w, makeRequest(`{"subdomain":"`+sub+`","poolDomain":"omani.works"}`))
got := decodeResp(t, w.Body)
if !got.Available {
t.Errorf("sub=%s: Available=false (wildcard regression!): %+v", sub, got)
}
}
// Exactly one check per call — PDM is consulted, DNS is not.
if len(fake.checks) != 3 {
t.Errorf("expected 3 PDM checks, got %d", len(fake.checks))
}
}
func TestCheckSubdomain_ManagedPoolPDMConflict(t *testing.T) {
t.Setenv("DYNADOT_MANAGED_DOMAINS", "omani.works")
pdm.ResetManagedDomains()
fake := &fakePDM{
check: func(ctx context.Context, pool, sub string) (*pdm.CheckResult, error) {
return &pdm.CheckResult{
Available: false,
Reason: "active-state",
Detail: "this subdomain is already taken by a live Sovereign — pick a different name",
FQDN: sub + "." + pool,
}, nil
},
}
h := NewWithPDM(slog.Default(), fake)
w := httptest.NewRecorder()
h.CheckSubdomain(w, makeRequest(`{"subdomain":"omantel","poolDomain":"omani.works"}`))
got := decodeResp(t, w.Body)
if got.Available {
t.Fatalf("expected unavailable, got %+v", got)
}
if got.Reason != "active-state" {
t.Errorf("Reason=%s want active-state", got.Reason)
}
}
func TestCheckSubdomain_BYOFallsBackToDNS(t *testing.T) {
t.Setenv("DYNADOT_MANAGED_DOMAINS", "omani.works")
pdm.ResetManagedDomains()
fake := &fakePDM{}
h := NewWithPDM(slog.Default(), fake)
// Pick a domain that is guaranteed to be in DNS — example.com always
// resolves. The handler should call LookupHost and surface "exists".
w := httptest.NewRecorder()
h.CheckSubdomain(w, makeRequest(`{"subdomain":"www","poolDomain":"example.com"}`))
got := decodeResp(t, w.Body)
if got.Available {
t.Errorf("www.example.com should resolve and be unavailable: %+v", got)
}
if len(fake.checks) != 0 {
t.Errorf("BYO path must NOT consult PDM; got %d checks", len(fake.checks))
}
}
func TestCheckSubdomain_BYONXDomainAvailable(t *testing.T) {
t.Setenv("DYNADOT_MANAGED_DOMAINS", "omani.works")
pdm.ResetManagedDomains()
fake := &fakePDM{}
h := NewWithPDM(slog.Default(), fake)
// A guaranteed-NXDOMAIN under example.com (RFC 6761).
w := httptest.NewRecorder()
h.CheckSubdomain(w, makeRequest(`{"subdomain":"this-name-must-not-resolve-1234567","poolDomain":"example.com"}`))
got := decodeResp(t, w.Body)
if !got.Available {
t.Errorf("BYO NXDOMAIN should be available, got %+v", got)
}
}
func TestCheckSubdomain_InvalidLabel(t *testing.T) {
t.Setenv("DYNADOT_MANAGED_DOMAINS", "omani.works")
pdm.ResetManagedDomains()
fake := &fakePDM{}
h := NewWithPDM(slog.Default(), fake)
w := httptest.NewRecorder()
h.CheckSubdomain(w, makeRequest(`{"subdomain":"-bad-","poolDomain":"omani.works"}`))
got := decodeResp(t, w.Body)
if got.Available {
t.Errorf("invalid label should be unavailable")
}
if got.Reason != "invalid-format" {
t.Errorf("Reason=%s want invalid-format", got.Reason)
}
if len(fake.checks) != 0 {
t.Errorf("invalid label must short-circuit before PDM is called")
}
}
func TestCheckSubdomain_PDMUnavailable(t *testing.T) {
t.Setenv("DYNADOT_MANAGED_DOMAINS", "omani.works")
pdm.ResetManagedDomains()
fake := &fakePDM{
check: func(ctx context.Context, pool, sub string) (*pdm.CheckResult, error) {
return nil, errors.New("connection refused")
},
}
h := NewWithPDM(slog.Default(), fake)
w := httptest.NewRecorder()
h.CheckSubdomain(w, makeRequest(`{"subdomain":"omantel","poolDomain":"omani.works"}`))
got := decodeResp(t, w.Body)
if got.Available {
t.Errorf("PDM unavailable must NOT be reported as Available=true")
}
if got.Reason != "pdm-unavailable" {
t.Errorf("Reason=%s want pdm-unavailable", got.Reason)
}
}

View File

@ -0,0 +1,295 @@
// Package pdm — HTTP client for pool-domain-manager.
//
// This package is the catalyst-api side of the contract introduced by #163.
// PDM owns every Dynadot write in the OpenOva fleet; catalyst-api never calls
// api.dynadot.com directly anymore. The wizard's pre-submit check, the
// reservation taken before `tofu apply`, the commit after the LB IP is known,
// and the release on `tofu destroy` all flow through this client.
//
// Per docs/INVIOLABLE-PRINCIPLES.md #4 the base URL is read from the
// POOL_DOMAIN_MANAGER_URL env var — defaulting to the in-cluster service
// FQDN so a stock catalyst-api deployment "just works" against the PDM
// running in openova-system. Tests/dev override the env var.
package pdm
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"os"
"strings"
"sync"
"time"
)
// Client is the catalyst-api → PDM HTTP client.
type Client struct {
BaseURL string
HTTP *http.Client
}
// New constructs a Client. baseURL must NOT have a trailing slash.
func New(baseURL string) *Client {
return &Client{
BaseURL: strings.TrimRight(baseURL, "/"),
HTTP: &http.Client{Timeout: 15 * time.Second},
}
}
// CheckResult mirrors PDM's response shape — kept loose so the wizard can
// surface PDM's reason/detail strings verbatim without an extra mapping.
type CheckResult struct {
Available bool `json:"available"`
Reason string `json:"reason,omitempty"`
Detail string `json:"detail,omitempty"`
FQDN string `json:"fqdn,omitempty"`
}
// Check calls GET /api/v1/pool/{domain}/check?sub=X.
func (c *Client) Check(ctx context.Context, poolDomain, subdomain string) (*CheckResult, error) {
u := fmt.Sprintf("%s/api/v1/pool/%s/check?sub=%s",
c.BaseURL, url.PathEscape(poolDomain), url.QueryEscape(subdomain))
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
if err != nil {
return nil, fmt.Errorf("build request: %w", err)
}
resp, err := c.HTTP.Do(req)
if err != nil {
return nil, fmt.Errorf("pdm check: %w", err)
}
defer resp.Body.Close()
body, _ := io.ReadAll(resp.Body)
if resp.StatusCode >= 500 {
return nil, fmt.Errorf("pdm /check status %d: %s", resp.StatusCode, truncate(string(body), 256))
}
var out CheckResult
if err := json.Unmarshal(body, &out); err != nil {
return nil, fmt.Errorf("decode pdm check: %w (body=%s)", err, truncate(string(body), 256))
}
return &out, nil
}
// Reservation is the wire response of POST /reserve.
type Reservation struct {
PoolDomain string `json:"poolDomain"`
Subdomain string `json:"subdomain"`
State string `json:"state"`
ReservedAt string `json:"reservedAt"`
ExpiresAt string `json:"expiresAt"`
ReservationToken string `json:"reservationToken"`
CreatedBy string `json:"createdBy"`
}
// ErrConflict — PDM returned 409 Conflict (subdomain already taken).
var ErrConflict = errors.New("pool allocation conflict")
// ErrNotFound — PDM returned 404 (no row to commit/release).
var ErrNotFound = errors.New("pool allocation not found")
// Reserve calls POST /api/v1/pool/{domain}/reserve. Returns ErrConflict on
// 409 so callers can distinguish "name taken" from "PDM down".
func (c *Client) Reserve(ctx context.Context, poolDomain, subdomain, createdBy string) (*Reservation, error) {
body := map[string]string{
"subdomain": subdomain,
"createdBy": createdBy,
}
raw, err := json.Marshal(body)
if err != nil {
return nil, err
}
u := fmt.Sprintf("%s/api/v1/pool/%s/reserve", c.BaseURL, url.PathEscape(poolDomain))
req, err := http.NewRequestWithContext(ctx, http.MethodPost, u, bytes.NewReader(raw))
if err != nil {
return nil, err
}
req.Header.Set("Content-Type", "application/json")
resp, err := c.HTTP.Do(req)
if err != nil {
return nil, fmt.Errorf("pdm reserve: %w", err)
}
defer resp.Body.Close()
respBody, _ := io.ReadAll(resp.Body)
switch resp.StatusCode {
case http.StatusCreated:
var out Reservation
if err := json.Unmarshal(respBody, &out); err != nil {
return nil, fmt.Errorf("decode reserve: %w (body=%s)", err, truncate(string(respBody), 256))
}
return &out, nil
case http.StatusConflict:
return nil, ErrConflict
default:
return nil, fmt.Errorf("pdm reserve status %d: %s", resp.StatusCode, truncate(string(respBody), 256))
}
}
// CommitInput maps to PDM's commit body shape.
type CommitInput struct {
Subdomain string
ReservationToken string
SovereignFQDN string
LoadBalancerIP string
}
// Commit calls POST /api/v1/pool/{domain}/commit.
func (c *Client) Commit(ctx context.Context, poolDomain string, in CommitInput) error {
body := map[string]string{
"subdomain": in.Subdomain,
"reservationToken": in.ReservationToken,
"sovereignFQDN": in.SovereignFQDN,
"loadBalancerIP": in.LoadBalancerIP,
}
raw, err := json.Marshal(body)
if err != nil {
return err
}
u := fmt.Sprintf("%s/api/v1/pool/%s/commit", c.BaseURL, url.PathEscape(poolDomain))
req, err := http.NewRequestWithContext(ctx, http.MethodPost, u, bytes.NewReader(raw))
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
resp, err := c.HTTP.Do(req)
if err != nil {
return fmt.Errorf("pdm commit: %w", err)
}
defer resp.Body.Close()
respBody, _ := io.ReadAll(resp.Body)
switch resp.StatusCode {
case http.StatusOK, http.StatusAccepted:
return nil
case http.StatusNotFound:
return ErrNotFound
default:
return fmt.Errorf("pdm commit status %d: %s", resp.StatusCode, truncate(string(respBody), 256))
}
}
// Release calls DELETE /api/v1/pool/{domain}/release.
func (c *Client) Release(ctx context.Context, poolDomain, subdomain string) error {
body := map[string]string{"subdomain": subdomain}
raw, err := json.Marshal(body)
if err != nil {
return err
}
u := fmt.Sprintf("%s/api/v1/pool/%s/release", c.BaseURL, url.PathEscape(poolDomain))
req, err := http.NewRequestWithContext(ctx, http.MethodDelete, u, bytes.NewReader(raw))
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
resp, err := c.HTTP.Do(req)
if err != nil {
return fmt.Errorf("pdm release: %w", err)
}
defer resp.Body.Close()
respBody, _ := io.ReadAll(resp.Body)
switch resp.StatusCode {
case http.StatusOK, http.StatusAccepted:
return nil
case http.StatusNotFound:
return ErrNotFound
default:
return fmt.Errorf("pdm release status %d: %s", resp.StatusCode, truncate(string(respBody), 256))
}
}
func truncate(s string, max int) string {
if len(s) <= max {
return s
}
return s[:max] + "..."
}
// ── Managed-pool resolution ─────────────────────────────────────────────
//
// catalyst-api needs to know which pool domains PDM owns (so it knows when
// to delegate to PDM vs. fall back to the BYO/DNS path). PDM exposes the
// list at /healthz, but caching that on every wizard keystroke is wasteful.
// Instead — per docs/INVIOLABLE-PRINCIPLES.md #4 — we read the same
// DYNADOT_MANAGED_DOMAINS env var that the K8s ExternalSecret projects into
// the PDM Pod, and that the same secret can project into the catalyst-api
// Pod for this purpose. The env var value is the contract; PDM is the writer.
var managedDomainsState struct {
once sync.Once
set map[string]struct{}
}
// IsManagedDomain reports whether the given domain is in the runtime
// DYNADOT_MANAGED_DOMAINS list. catalyst-api uses this to route /check
// requests: managed → PDM, BYO → DNS lookup.
//
// Resolution order mirrors the legacy dynadot package's so a deployment
// migrating to PDM keeps working without secret edits:
// 1. DYNADOT_MANAGED_DOMAINS env var (canonical)
// 2. DYNADOT_DOMAIN single-value fallback
// 3. Built-in defaults: openova.io, omani.works
func IsManagedDomain(domain string) bool {
d := strings.ToLower(strings.TrimSpace(domain))
if d == "" {
return false
}
managedDomainsState.once.Do(func() {
managedDomainsState.set = computeManagedDomains()
})
_, ok := managedDomainsState.set[d]
return ok
}
// ResetManagedDomains clears the cache so tests can re-evaluate after
// mutating env vars.
func ResetManagedDomains() {
managedDomainsState.once = sync.Once{}
managedDomainsState.set = nil
}
// ManagedDomains returns a sorted, deduplicated copy of the configured
// managed-domain list.
func ManagedDomains() []string {
managedDomainsState.once.Do(func() {
managedDomainsState.set = computeManagedDomains()
})
out := make([]string, 0, len(managedDomainsState.set))
for d := range managedDomainsState.set {
out = append(out, d)
}
for i := 1; i < len(out); i++ {
for j := i; j > 0 && out[j-1] > out[j]; j-- {
out[j-1], out[j] = out[j], out[j-1]
}
}
return out
}
func computeManagedDomains() map[string]struct{} {
out := make(map[string]struct{})
if raw := os.Getenv("DYNADOT_MANAGED_DOMAINS"); strings.TrimSpace(raw) != "" {
out = splitDomainsList(raw)
if len(out) > 0 {
return out
}
}
if d := strings.ToLower(strings.TrimSpace(os.Getenv("DYNADOT_DOMAIN"))); d != "" {
out[d] = struct{}{}
return out
}
out["openova.io"] = struct{}{}
out["omani.works"] = struct{}{}
return out
}
func splitDomainsList(raw string) map[string]struct{} {
raw = strings.ToLower(raw)
raw = strings.ReplaceAll(raw, ",", " ")
out := make(map[string]struct{})
for _, p := range strings.Fields(raw) {
out[p] = struct{}{}
}
return out
}