fix(catalyst-api): release PDM subdomain on Pod-restart orphan + add explicit release endpoint (closes #489) (#502)
* fix(catalyst-api): release PDM subdomain on Pod-restart orphan + add explicit release endpoint
Each failed provision permanently consumed its pool subdomain in PDM —
otech, otech1..otech9 stayed locked because two release seams were
missing:
1. Pod-restart orphan: when catalyst-api dies mid-provisioning, the
runProvisioning goroutine that would have called pdm.Release on
Phase-0 failure dies with the Pod. fromRecord rewrites the
rehydrated status to "failed" but nothing reaps the still-active
reservation. restoreFromStore now fires a best-effort
pdm.Release for every record it rewrites from in-flight to failed,
gated on AdoptedAt==nil so customer-owned Sovereigns are protected
(see the gate sketch after this list).
2. Abandoned-deployment retries: the only operator-driven release path
was Cancel & Wipe, which requires re-entering the HetznerToken.
Franchise customers retrying under the same subdomain after a
botched provision shouldn't need a Hetzner credential roundtrip
for a PDM-only fix. New endpoint
DELETE /api/v1/deployments/{id}/release-subdomain releases the
PDM allocation only — no Hetzner work, no record deletion. Refuses
in-flight (409), wiped (410), and adopted (422) deployments (a
minimal call sketch follows the test summary below).
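
For reference, the restore-time seam in (1) reduces to one guarded
dispatch. This is condensed from the restoreFromStore hunk below and
uses only identifiers that appear in this diff:

    // Inside the branch where fromRecord just rewrote an in-flight
    // record to "failed". Release only when every PDM pointer survived
    // rehydration and the deployment was never adopted by a customer.
    if dep.Status == "failed" && rec.PDMReservationToken != "" &&
        rec.PDMPoolDomain != "" && rec.PDMSubdomain != "" &&
        dep.AdoptedAt == nil && h.pdm != nil {
        // goroutine: a slow PDM must not stall the rehydration loop
        go h.releaseOrphanedReservation(dep.ID, rec.PDMPoolDomain, rec.PDMSubdomain)
    }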
Tests cover: failed-deployment release, idempotent ErrNotFound, in-flight
refusal across all in-flight statuses, adopted protection, BYO no-op,
404 on unknown id, 502 on PDM transient, Pod-restart orphan release on
restoreFromStore, and the negative-path proof that a clean-failed record
on disk does NOT trigger a duplicate Release at restart.
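
A minimal operator-side call sketch for (2), assuming only the standard
library and the status codes this endpoint defines; the base URL and the
releaseSubdomain helper are hypothetical, not part of this commit:

    package client

    import (
        "errors"
        "fmt"
        "net/http"
    )

    // releaseSubdomain frees the PDM slot for a failed deployment.
    // base is a placeholder catalyst-api address, e.g. from config.
    func releaseSubdomain(base, id string) error {
        req, err := http.NewRequest(http.MethodDelete,
            base+"/api/v1/deployments/"+id+"/release-subdomain", nil)
        if err != nil {
            return err
        }
        resp, err := http.DefaultClient.Do(req)
        if err != nil {
            return err
        }
        defer resp.Body.Close()
        switch resp.StatusCode {
        case http.StatusOK: // released, or clean BYO no-op
            return nil
        case http.StatusConflict: // in-flight; wait or POST /wipe
            return errors.New("deployment still in-flight")
        case http.StatusGone:
            return errors.New("deployment already wiped")
        case http.StatusUnprocessableEntity: // adopted; protected
            return errors.New("adopted Sovereign; release refused")
        case http.StatusBadGateway: // PDM transient; safe to retry
            return errors.New("pdm release failed; retry")
        default:
            return fmt.Errorf("unexpected status %d", resp.StatusCode)
        }
    }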
Closes #489
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* test(catalyst-api): fix data race in fakePDM around orphan-release goroutine
The Pod-restart orphan-release path (issue #489) fires pdm.Release in a
goroutine spawned by restoreFromStore. The race detector flagged the
test's read of fpdm.releases against the goroutine's append. Adding a
sync.Mutex to fakePDM + a snapshotReleases() accessor closes the race
without changing the surface that 30+ other tests already use.
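
Reduced to the bare pattern, the race and its fix look like this (a
sketch only; the real fakePDM in this diff records more call types and
unlocks before invoking the override callback):

    type releaseCall struct{ pool, sub string }

    type fakePDM struct {
        mu       sync.Mutex
        releases []releaseCall
    }

    // Called concurrently from the orphan-release goroutine: the
    // append must happen under the mutex.
    func (f *fakePDM) Release(ctx context.Context, pool, sub string) error {
        f.mu.Lock()
        defer f.mu.Unlock()
        f.releases = append(f.releases, releaseCall{pool, sub})
        return nil
    }

    // Tests read this copy instead of f.releases; the same mutex
    // orders the read after any in-flight append, so -race is quiet.
    func (f *fakePDM) snapshotReleases() []releaseCall {
        f.mu.Lock()
        defer f.mu.Unlock()
        return append([]releaseCall(nil), f.releases...)
    }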
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---------
Co-authored-by: hatiyildiz <hatiyildiz@noreply.github.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
@@ -151,6 +151,15 @@ func main() {
 	// PurgeReport summary. The wizard's failed-state banner renders the
 	// operator confirmation modal that POSTs here.
 	r.Post("/api/v1/deployments/{id}/wipe", h.WipeDeployment)
+	// Subdomain-only release endpoint (issue #489). Releases the PDM
+	// allocation row for a failed-or-abandoned deployment WITHOUT
+	// requiring the operator to re-enter their HetznerToken. Lets a
+	// franchise customer retry under the same pool subdomain after a
+	// botched provision instead of being forced to pick acmeN+1. Does
+	// NOT touch Hetzner — the Cancel & Wipe flow remains the canonical
+	// path for live cloud cleanup. Refuses on in-flight deployments
+	// (409), wiped deployments (410), or adopted Sovereigns (422).
+	r.Delete("/api/v1/deployments/{id}/release-subdomain", h.ReleaseSubdomain)
 	// Handover finalisation (issue #317). Catalyst-Zero side: stops the
 	// helmwatch informer, ships the OpenTofu state to the new Sovereign's
 	// catalyst-api, and purges every local trace once the new side
@@ -468,6 +468,27 @@ func (h *Handler) restoreFromStore() {
 				"err", err,
 			)
 		}
+		// Pod-restart-orphan PDM release (issue #489). The previous
+		// catalyst-api Pod owned the runProvisioning goroutine that
+		// would have called pdm.Release on Phase-0 failure. The
+		// goroutine died with the Pod; the reservation is now
+		// orphaned in PDM, locking the subdomain forever. fromRecord
+		// rewrites the status to "failed" precisely BECAUSE the
+		// previous Pod was killed mid-apply — the same condition
+		// means we MUST release the PDM slot here, otherwise a
+		// retry under the same subdomain returns 409 conflict and
+		// the franchise customer is forced to pick `acmeN+1` for
+		// what should be a routine retry.
+		//
+		// Best-effort + asynchronous: a slow PDM at startup must
+		// NOT block the rest of the rehydration loop (other
+		// deployments need to land in sync.Map quickly so the
+		// /events poll works).
+		if dep.Status == "failed" && rec.PDMReservationToken != "" &&
+			rec.PDMPoolDomain != "" && rec.PDMSubdomain != "" &&
+			dep.AdoptedAt == nil && h.pdm != nil {
+			go h.releaseOrphanedReservation(dep.ID, rec.PDMPoolDomain, rec.PDMSubdomain)
+		}
 	}
 
 	// Resume the Phase-1 helmwatch goroutine after a Pod restart
@@ -498,6 +519,49 @@ func (h *Handler) restoreFromStore() {
 	)
 }
 
+// releaseOrphanedReservation calls pdm.Release for a deployment whose
+// status was rewritten to "failed" because the catalyst-api Pod died
+// mid-provisioning (issue #489). Best-effort: any failure is logged
+// at warn but does not block other rehydration work. Idempotent — a
+// pdm.ErrNotFound response (the reservation already expired or was
+// previously released) is treated as success. The dep entry's PDM
+// pointers are cleared on success so a follow-up Cancel & Wipe doesn't
+// double-release.
+//
+// Run as a goroutine from restoreFromStore: the rehydration loop must
+// not block on PDM. A 30s timeout caps each call.
+func (h *Handler) releaseOrphanedReservation(deploymentID, poolDomain, subdomain string) {
+	if h.pdm == nil || poolDomain == "" || subdomain == "" {
+		return
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+	err := h.pdm.Release(ctx, poolDomain, subdomain)
+	if err != nil && !errors.Is(err, pdm.ErrNotFound) {
+		h.log.Warn("orphan-release: pdm release failed; subdomain may stay locked until TTL",
+			"id", deploymentID,
+			"poolDomain", poolDomain,
+			"subdomain", subdomain,
+			"err", err,
+		)
+		return
+	}
+	h.log.Info("orphan-release: released PDM allocation for pod-restart-orphaned deployment",
+		"id", deploymentID,
+		"poolDomain", poolDomain,
+		"subdomain", subdomain,
+	)
+	if val, ok := h.deployments.Load(deploymentID); ok {
+		dep := val.(*Deployment)
+		dep.mu.Lock()
+		dep.pdmPoolDomain = ""
+		dep.pdmSubdomain = ""
+		dep.pdmReservationToken = ""
+		dep.mu.Unlock()
+		h.persistDeployment(dep)
+	}
+}
+
 // shouldResumePhase1 returns true when a rehydrated deployment is a
 // candidate for re-attaching the Phase-1 helmwatch goroutine. The
 // criteria are:
@@ -0,0 +1,390 @@
+// Tests for the subdomain-release path introduced by issue #489.
+//
+// Two independent contracts:
+//
+//  1. DELETE /api/v1/deployments/{id}/release-subdomain releases the PDM
+//     allocation row for a failed-or-abandoned deployment without
+//     requiring the operator's HetznerToken. This is the franchise-retry
+//     ergonomic — botched provisions must not permanently lock a pool
+//     subdomain.
+//
+//  2. restoreFromStore fires a best-effort PDM Release for any deployment
+//     it rewrites from "in-flight" to "failed" because of a Pod restart.
+//     The runProvisioning goroutine that would have called Release died
+//     with the previous Pod; the rehydration path is the next best seam.
+package handler
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/pdm"
+	"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/provisioner"
+	"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/store"
+)
+
+// callReleaseSubdomain wires a chi router so chi.URLParam("id") resolves
+// to the supplied id and returns the recorded response.
+func callReleaseSubdomain(h *Handler, id string) *httptest.ResponseRecorder {
+	r := chi.NewRouter()
+	r.Delete("/api/v1/deployments/{id}/release-subdomain", h.ReleaseSubdomain)
+	w := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodDelete, "/api/v1/deployments/"+id+"/release-subdomain", nil)
+	r.ServeHTTP(w, req)
+	return w
+}
+
+// TestReleaseSubdomain_FailedDeploymentReleasesPDMSlot is the headline
+// invariant: a failed deployment with a still-locked PDM allocation gets
+// its slot released so the next attempt under the same subdomain
+// succeeds.
+func TestReleaseSubdomain_FailedDeploymentReleasesPDMSlot(t *testing.T) {
+	fpdm := &fakePDM{}
+	h := NewWithPDM(silentLogger(), fpdm)
+
+	dep := &Deployment{
+		ID:                  "dep-failed-1",
+		Status:              "failed",
+		Request:             provisioner.Request{SovereignFQDN: "otech5.omani.works", SovereignDomainMode: "pool"},
+		pdmReservationToken: "rt-otech5",
+		pdmPoolDomain:       "omani.works",
+		pdmSubdomain:        "otech5",
+	}
+	h.deployments.Store(dep.ID, dep)
+
+	w := callReleaseSubdomain(h, dep.ID)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", w.Code, w.Body.String())
+	}
+
+	var resp releaseSubdomainResponse
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if !resp.PDMReleased {
+		t.Errorf("PDMReleased=false, want true (body=%s)", w.Body.String())
+	}
+	if resp.PoolDomain != "omani.works" || resp.Subdomain != "otech5" {
+		t.Errorf("response pool/sub = %q/%q, want omani.works/otech5", resp.PoolDomain, resp.Subdomain)
+	}
+
+	// PDM client received exactly one Release call with the correct args.
+	if got := len(fpdm.releases); got != 1 {
+		t.Fatalf("expected 1 PDM Release call, got %d (%+v)", got, fpdm.releases)
+	}
+	if fpdm.releases[0].pool != "omani.works" || fpdm.releases[0].sub != "otech5" {
+		t.Errorf("Release args = %+v, want omani.works/otech5", fpdm.releases[0])
+	}
+
+	// Deployment's PDM pointers were cleared so a follow-up Cancel & Wipe
+	// doesn't try to release a slot we just released.
+	if dep.pdmPoolDomain != "" || dep.pdmSubdomain != "" || dep.pdmReservationToken != "" {
+		t.Errorf("PDM pointers not cleared: pool=%q sub=%q token=%q",
+			dep.pdmPoolDomain, dep.pdmSubdomain, dep.pdmReservationToken)
+	}
+}
+
+// TestReleaseSubdomain_PDMNotFoundIsIdempotent — calling release twice
+// must succeed the second time. PDM returns 404/ErrNotFound when the
+// allocation has already been released; we treat that as success.
+func TestReleaseSubdomain_PDMNotFoundIsIdempotent(t *testing.T) {
+	fpdm := &fakePDM{
+		release: func(ctx context.Context, pool, sub string) error {
+			return pdm.ErrNotFound
+		},
+	}
+	h := NewWithPDM(silentLogger(), fpdm)
+
+	dep := &Deployment{
+		ID:            "dep-not-found",
+		Status:        "failed",
+		Request:       provisioner.Request{SovereignFQDN: "otech.omani.works", SovereignDomainMode: "pool"},
+		pdmPoolDomain: "omani.works",
+		pdmSubdomain:  "otech",
+	}
+	h.deployments.Store(dep.ID, dep)
+
+	w := callReleaseSubdomain(h, dep.ID)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s — pdm.ErrNotFound must be treated as success", w.Code, w.Body.String())
+	}
+	var resp releaseSubdomainResponse
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if !resp.PDMReleased {
+		t.Errorf("PDMReleased=false on ErrNotFound; idempotent contract requires true")
+	}
+}
+
+// TestReleaseSubdomain_InFlightDeploymentRefuses prevents stripping a
+// reservation out from under a runProvisioning goroutine that may yet
+// call Commit.
+func TestReleaseSubdomain_InFlightDeploymentRefuses(t *testing.T) {
+	fpdm := &fakePDM{}
+	h := NewWithPDM(silentLogger(), fpdm)
+
+	for _, status := range []string{"pending", "provisioning", "tofu-applying", "phase1-watching"} {
+		t.Run(status, func(t *testing.T) {
+			fpdm.releases = nil
+			dep := &Deployment{
+				ID:            "dep-" + status,
+				Status:        status,
+				Request:       provisioner.Request{SovereignFQDN: "otech.omani.works", SovereignDomainMode: "pool"},
+				pdmPoolDomain: "omani.works",
+				pdmSubdomain:  "otech",
+			}
+			h.deployments.Store(dep.ID, dep)
+
+			w := callReleaseSubdomain(h, dep.ID)
+
+			if w.Code != http.StatusConflict {
+				t.Errorf("status=%d, want 409 (in-flight) — body=%s", w.Code, w.Body.String())
+			}
+			if got := len(fpdm.releases); got != 0 {
+				t.Errorf("PDM Release was called for in-flight deployment (%d times)", got)
+			}
+		})
+	}
+}
+
+// TestReleaseSubdomain_AdoptedDeploymentRefuses protects customer-owned
+// Sovereigns from accidental subdomain release. The Sovereign console
+// is reachable on the committed subdomain; releasing would 404 every
+// customer request until the operator runs a re-commit.
+func TestReleaseSubdomain_AdoptedDeploymentRefuses(t *testing.T) {
+	fpdm := &fakePDM{}
+	h := NewWithPDM(silentLogger(), fpdm)
+
+	now := time.Now().UTC()
+	dep := &Deployment{
+		ID:            "dep-adopted",
+		Status:        "ready",
+		Request:       provisioner.Request{SovereignFQDN: "acme.omani.works", SovereignDomainMode: "pool"},
+		pdmPoolDomain: "omani.works",
+		pdmSubdomain:  "acme",
+		AdoptedAt:     &now,
+	}
+	h.deployments.Store(dep.ID, dep)
+
+	w := callReleaseSubdomain(h, dep.ID)
+
+	if w.Code != http.StatusUnprocessableEntity {
+		t.Errorf("status=%d, want 422 (adopted) — body=%s", w.Code, w.Body.String())
+	}
+	if got := len(fpdm.releases); got != 0 {
+		t.Errorf("PDM Release fired for adopted deployment (%d times)", got)
+	}
+}
+
+// TestReleaseSubdomain_BYODeploymentNoOp — BYO domains never have a PDM
+// allocation. Calling release should return 200 with a noOp marker so
+// wizard UI flows can call this unconditionally.
+func TestReleaseSubdomain_BYODeploymentNoOp(t *testing.T) {
+	fpdm := &fakePDM{}
+	h := NewWithPDM(silentLogger(), fpdm)
+
+	dep := &Deployment{
+		ID:      "dep-byo",
+		Status:  "failed",
+		Request: provisioner.Request{SovereignFQDN: "k8s.customer.example", SovereignDomainMode: "byo"},
+	}
+	h.deployments.Store(dep.ID, dep)
+
+	w := callReleaseSubdomain(h, dep.ID)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", w.Code, w.Body.String())
+	}
+	var resp releaseSubdomainResponse
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if resp.PDMReleased {
+		t.Errorf("PDMReleased=true on BYO; want false (no allocation to release)")
+	}
+	if resp.NoOp == "" {
+		t.Errorf("NoOp empty on BYO; want explanatory marker")
+	}
+	if got := len(fpdm.releases); got != 0 {
+		t.Errorf("PDM Release fired on BYO deployment (%d times)", got)
+	}
+}
+
+// TestReleaseSubdomain_UnknownDeploymentIs404
+func TestReleaseSubdomain_UnknownDeploymentIs404(t *testing.T) {
+	h := NewWithPDM(silentLogger(), &fakePDM{})
+	w := callReleaseSubdomain(h, "no-such-deployment")
+	if w.Code != http.StatusNotFound {
+		t.Errorf("status=%d, want 404 — body=%s", w.Code, w.Body.String())
+	}
+}
+
+// TestReleaseSubdomain_PDMTransientFailureSurfaces502 — when PDM is up
+// but returns an error other than NotFound (e.g. 500 or network blip),
+// surface 502 so the operator knows to retry.
+func TestReleaseSubdomain_PDMTransientFailureSurfaces502(t *testing.T) {
+	fpdm := &fakePDM{
+		release: func(ctx context.Context, pool, sub string) error {
+			return errors.New("connection refused")
+		},
+	}
+	h := NewWithPDM(silentLogger(), fpdm)
+
+	dep := &Deployment{
+		ID:            "dep-transient",
+		Status:        "failed",
+		Request:       provisioner.Request{SovereignFQDN: "otech.omani.works", SovereignDomainMode: "pool"},
+		pdmPoolDomain: "omani.works",
+		pdmSubdomain:  "otech",
+	}
+	h.deployments.Store(dep.ID, dep)
+
+	w := callReleaseSubdomain(h, dep.ID)
+
+	if w.Code != http.StatusBadGateway {
+		t.Errorf("status=%d, want 502 — body=%s", w.Code, w.Body.String())
+	}
+}
+
+// TestRestoreFromStore_PodRestartOrphanReleasesPDMSlot covers the
+// rehydration path: when a Pod dies mid-provisioning, fromRecord rewrites
+// the on-disk status to "failed". restoreFromStore must, in that exact
+// case, fire pdm.Release for the orphaned reservation. Without this,
+// the "12-otech" symptom from issue #489 reproduces — every Pod restart
+// during provisioning permanently locks a subdomain.
+func TestRestoreFromStore_PodRestartOrphanReleasesPDMSlot(t *testing.T) {
+	dir := t.TempDir()
+	st, err := store.New(dir)
+	if err != nil {
+		t.Fatalf("store.New: %v", err)
+	}
+
+	// Seed an on-disk record that looks like a Pod was killed during
+	// `tofu apply`: status="tofu-applying" (in-flight), with a PDM
+	// reservation already taken.
+	rec := store.Record{
+		ID:                  "pod-killed-deployment",
+		Status:              "tofu-applying", // in-flight at kill time
+		PDMReservationToken: "rt-orphan",
+		PDMPoolDomain:       "omani.works",
+		PDMSubdomain:        "otech7",
+		Request: store.RedactedRequest{
+			SovereignFQDN:       "otech7.omani.works",
+			SovereignDomainMode: "pool",
+		},
+		StartedAt: time.Now().UTC(),
+	}
+	if err := st.Save(rec); err != nil {
+		t.Fatalf("seed record: %v", err)
+	}
+
+	// New PDM that records every Release call.
+	fpdm := &fakePDM{}
+
+	// NewWithStore runs restoreFromStore; the rewrite-to-failed branch
+	// fires releaseOrphanedReservation as a goroutine.
+	h := NewWithStore(silentLogger(), fpdm, st)
+
+	// Wait briefly for the async release to fire — capped at 2s. Use
+	// the mutex-protected snapshot accessor; a direct read of
+	// fpdm.releases here races the goroutine's append (race detector
+	// flagged it on first CI run before the accessor was added).
+	deadline := time.Now().Add(2 * time.Second)
+	var releases []releaseCall
+	for time.Now().Before(deadline) {
+		releases = fpdm.snapshotReleases()
+		if len(releases) > 0 {
+			break
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+
+	if got := len(releases); got != 1 {
+		t.Fatalf("expected 1 orphan PDM Release; got %d (%+v)", got, releases)
+	}
+	if releases[0].pool != "omani.works" || releases[0].sub != "otech7" {
+		t.Errorf("orphan Release args = %+v, want omani.works/otech7", releases[0])
+	}
+
+	// Deployment landed in sync.Map with status rewritten to "failed".
+	val, ok := h.deployments.Load("pod-killed-deployment")
+	if !ok {
+		t.Fatalf("deployment not in sync.Map after restore")
+	}
+	dep := val.(*Deployment)
+	if dep.Status != "failed" {
+		t.Errorf("Status=%q, want failed (Pod-restart rewrite contract)", dep.Status)
+	}
+
+	// Wait for the success-path field clear (the goroutine clears the
+	// pointers + persists after a successful release).
+	deadline = time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		dep.mu.Lock()
+		cleared := dep.pdmPoolDomain == "" && dep.pdmSubdomain == ""
+		dep.mu.Unlock()
+		if cleared {
+			break
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+	dep.mu.Lock()
+	defer dep.mu.Unlock()
+	if dep.pdmPoolDomain != "" || dep.pdmSubdomain != "" {
+		t.Errorf("PDM pointers not cleared after orphan-release: pool=%q sub=%q",
+			dep.pdmPoolDomain, dep.pdmSubdomain)
+	}
+}
+
+// TestRestoreFromStore_TerminalRecordDoesNotReleasePDM proves we DON'T
+// re-release a deployment that finished cleanly (status="failed" was
+// already the terminal state on disk, not a rewrite). A clean failure
+// already had its release path run by the original Pod's
+// runProvisioning goroutine; firing a second Release at restart time
+// would be a noisy duplicate.
+func TestRestoreFromStore_TerminalRecordDoesNotReleasePDM(t *testing.T) {
+	dir := t.TempDir()
+	st, err := store.New(dir)
+	if err != nil {
+		t.Fatalf("store.New: %v", err)
+	}
+
+	// Record was already "failed" on disk — NOT an in-flight kill.
+	rec := store.Record{
+		ID:                  "clean-fail",
+		Status:              "failed",
+		PDMReservationToken: "rt-clean",
+		PDMPoolDomain:       "omani.works",
+		PDMSubdomain:        "otech-clean",
+		Request: store.RedactedRequest{
+			SovereignFQDN:       "otech-clean.omani.works",
+			SovereignDomainMode: "pool",
+		},
+		StartedAt: time.Now().UTC(),
+	}
+	if err := st.Save(rec); err != nil {
+		t.Fatalf("seed record: %v", err)
+	}
+
+	fpdm := &fakePDM{}
+	_ = NewWithStore(silentLogger(), fpdm, st)
+
+	// Allow a generous window for any spurious goroutine to fire.
+	time.Sleep(150 * time.Millisecond)
+
+	releases := fpdm.snapshotReleases()
+	if got := len(releases); got != 0 {
+		t.Errorf("expected 0 PDM Release calls for already-terminal record, got %d (%+v)",
+			got, releases)
+	}
+}
@@ -23,6 +23,7 @@ import (
 	"net/http"
 	"net/http/httptest"
 	"strings"
+	"sync"
 	"testing"
 
 	"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/pdm"
@@ -30,7 +31,13 @@ import (
 
 // fakePDM is a stub pdmClient that records every call. We assert against the
 // recorded calls to prove the behaviour the architecture requires.
+//
+// mu guards the recorded-call slices because the orphan-release path
+// introduced by issue #489 fires pdm.Release in a goroutine spawned by
+// restoreFromStore. Without the mutex, -race flags the slice append in
+// Release racing the test's read.
 type fakePDM struct {
+	mu       sync.Mutex
 	checks   []checkCall
 	check    func(ctx context.Context, pool, sub string) (*pdm.CheckResult, error)
 	reserves []reserveCall
@@ -41,22 +48,40 @@ type fakePDM struct {
 	release  func(ctx context.Context, pool, sub string) error
 }
 
+// snapshotReleases returns a copy of the recorded Release calls under
+// the mutex. Tests that race goroutines against the recorder MUST use
+// this instead of reading `f.releases` directly, otherwise -race flags
+// the read.
+func (f *fakePDM) snapshotReleases() []releaseCall {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	out := make([]releaseCall, len(f.releases))
+	copy(out, f.releases)
+	return out
+}
+
 type checkCall struct{ pool, sub string }
 type reserveCall struct{ pool, sub, by string }
 type releaseCall struct{ pool, sub string }
 
 func (f *fakePDM) Check(ctx context.Context, pool, sub string) (*pdm.CheckResult, error) {
+	f.mu.Lock()
 	f.checks = append(f.checks, checkCall{pool, sub})
-	if f.check != nil {
-		return f.check(ctx, pool, sub)
+	cb := f.check
+	f.mu.Unlock()
+	if cb != nil {
+		return cb(ctx, pool, sub)
 	}
 	return &pdm.CheckResult{Available: true, FQDN: sub + "." + pool}, nil
 }
 
 func (f *fakePDM) Reserve(ctx context.Context, pool, sub, by string) (*pdm.Reservation, error) {
+	f.mu.Lock()
 	f.reserves = append(f.reserves, reserveCall{pool, sub, by})
-	if f.reserve != nil {
-		return f.reserve(ctx, pool, sub, by)
+	cb := f.reserve
+	f.mu.Unlock()
+	if cb != nil {
+		return cb(ctx, pool, sub, by)
 	}
 	return &pdm.Reservation{
 		PoolDomain: pool, Subdomain: sub, State: "reserved",
@@ -65,17 +90,23 @@ func (f *fakePDM) Reserve(ctx context.Context, pool, sub, by string) (*pdm.Reser
 }
 
 func (f *fakePDM) Commit(ctx context.Context, pool string, in pdm.CommitInput) error {
+	f.mu.Lock()
 	f.commits = append(f.commits, in)
-	if f.commit != nil {
-		return f.commit(ctx, pool, in)
+	cb := f.commit
+	f.mu.Unlock()
+	if cb != nil {
+		return cb(ctx, pool, in)
 	}
 	return nil
 }
 
 func (f *fakePDM) Release(ctx context.Context, pool, sub string) error {
+	f.mu.Lock()
 	f.releases = append(f.releases, releaseCall{pool, sub})
-	if f.release != nil {
-		return f.release(ctx, pool, sub)
+	cb := f.release
+	f.mu.Unlock()
+	if cb != nil {
+		return cb(ctx, pool, sub)
 	}
 	return nil
 }
 
@@ -46,9 +46,15 @@ import (
 	"github.com/go-chi/chi/v5"
 
 	"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/hetzner"
+	"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/pdm"
 	"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/provisioner"
 )
 
+// pdmErrNotFound returns the sentinel value to compare against PDM Release
+// outcomes. Wrapped in a function so callers can use errors.Is without
+// importing the pdm package at every call site.
+func pdmErrNotFound() error { return pdm.ErrNotFound }
+
 // wipeRequest is the body of POST /api/v1/deployments/{id}/wipe.
 //
 // HetznerToken is required ONLY when the on-disk Deployment record's
@@ -294,6 +300,184 @@ func deploymentSovereignName(fqdn string) string {
 	return strings.ReplaceAll(fqdn, ".", "-")
 }
 
+// releaseSubdomainResponse is the wire shape of DELETE
+// /api/v1/deployments/{id}/release-subdomain — a subdomain-only release
+// path that does NOT require the HetznerToken (issue #489). The full
+// Cancel & Wipe flow remains the canonical purge for live Hetzner
+// resources; this endpoint is the narrower fix for the case where a
+// failed-or-abandoned deployment locks a pool subdomain that an
+// operator wants to retry under the SAME name.
+type releaseSubdomainResponse struct {
+	DeploymentID string `json:"deploymentId"`
+	PoolDomain   string `json:"poolDomain"`
+	Subdomain    string `json:"subdomain"`
+	PDMReleased  bool   `json:"pdmReleased"`
+	NoOp         string `json:"noOp,omitempty"`
+	Error        string `json:"error,omitempty"`
+}
+
+// ReleaseSubdomain handles DELETE /api/v1/deployments/{id}/release-subdomain.
+//
+// Issue #489 — each failed provision permanently consumed its subdomain
+// because the only release seams were (a) runProvisioning's post-Phase-0
+// failure path, dead after a Pod restart, and (b) the Cancel & Wipe flow
+// which requires the operator to re-enter the HetznerToken. For franchise
+// customers a retry of `acme.omani.works` after a failed `acme.omani.works`
+// must NOT need a new subdomain or a HetznerToken roundtrip — the slot
+// belongs to the customer, not to the failed attempt.
+//
+// This endpoint:
+//
+//   - Looks up the deployment by id.
+//   - Refuses to release a deployment that is still in-flight (operator
+//     must wait or wipe properly).
+//   - Refuses to release a deployment whose AdoptedAt is set — that's a
+//     production customer Sovereign, not an abandoned attempt.
+//   - Calls PDM Release for the deployment's pdmPoolDomain + pdmSubdomain.
+//   - Treats pdm.ErrNotFound as success (idempotent — a second call after
+//     PDM has already released the slot is a no-op).
+//   - Does NOT touch Hetzner (no destroy, no orphan purge), does NOT
+//     delete the on-disk record, does NOT mark the deployment "wiped".
+//     The operator can still inspect the failed deployment + run a
+//     proper Cancel & Wipe later. The only mutation is on PDM.
+//
+// Response codes:
+//
+//	200 — release succeeded (or was a no-op)
+//	404 — unknown deployment id
+//	409 — deployment is still in-flight; cannot release while running
+//	410 — deployment was already wiped
+//	422 — deployment has been adopted by a customer; protected
+//	502 — PDM call failed (operator can retry)
+func (h *Handler) ReleaseSubdomain(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+
+	val, ok := h.deployments.Load(id)
+	if !ok {
+		http.Error(w, "deployment not found", http.StatusNotFound)
+		return
+	}
+	dep := val.(*Deployment)
+
+	dep.mu.Lock()
+	status := dep.Status
+	poolDomain := dep.pdmPoolDomain
+	subdomain := dep.pdmSubdomain
+	adopted := dep.AdoptedAt != nil
+	dep.mu.Unlock()
+
+	// Adopted deployments are customer-owned Sovereigns — never strip a
+	// committed pool record from underneath an active customer.
+	if adopted {
+		writeJSON(w, http.StatusUnprocessableEntity, releaseSubdomainResponse{
+			DeploymentID: id,
+			PoolDomain:   poolDomain,
+			Subdomain:    subdomain,
+			Error:        "deployment has been adopted by a customer; the pool record protects the live Sovereign and cannot be released via this endpoint",
+		})
+		return
+	}
+
+	// Refuse to release a deployment that is still in-flight — the
+	// runProvisioning goroutine may still call Commit. Operator must
+	// wait for terminal state, or run the full Cancel & Wipe flow.
+	if isInFlightStatus(status) {
+		writeJSON(w, http.StatusConflict, releaseSubdomainResponse{
+			DeploymentID: id,
+			PoolDomain:   poolDomain,
+			Subdomain:    subdomain,
+			Error:        "deployment is still in-flight (status=" + status + ") — wait for terminal state or use POST /wipe",
+		})
+		return
+	}
+	if status == "wiped" {
+		writeJSON(w, http.StatusGone, releaseSubdomainResponse{
+			DeploymentID: id,
+			PoolDomain:   poolDomain,
+			Subdomain:    subdomain,
+			Error:        "deployment already wiped",
+		})
+		return
+	}
+
+	// Fallback FQDN split for older records that committed without
+	// stamping pdmPoolDomain/pdmSubdomain (the wipe path uses the same
+	// fallback; keeping behaviour symmetric).
+	if poolDomain == "" || subdomain == "" {
+		if idx := strings.IndexByte(dep.Request.SovereignFQDN, '.'); idx > 0 {
+			subdomain = dep.Request.SovereignFQDN[:idx]
+			poolDomain = dep.Request.SovereignFQDN[idx+1:]
+		}
+	}
+
+	// BYO deployments don't have a PDM allocation to release. Surface
+	// that as a clean 200 no-op so wizard UI flows can call this
+	// unconditionally.
+	if dep.Request.SovereignDomainMode != "pool" || poolDomain == "" || subdomain == "" {
+		writeJSON(w, http.StatusOK, releaseSubdomainResponse{
+			DeploymentID: id,
+			PoolDomain:   poolDomain,
+			Subdomain:    subdomain,
+			NoOp:         "no pool allocation to release (BYO or unresolvable pool)",
+		})
+		return
+	}
+
+	if h.pdm == nil {
+		writeJSON(w, http.StatusServiceUnavailable, releaseSubdomainResponse{
+			DeploymentID: id,
+			PoolDomain:   poolDomain,
+			Subdomain:    subdomain,
+			Error:        "pool-domain-manager client is not configured",
+		})
+		return
+	}
+
+	releaseCtx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
+	defer cancel()
+	err := h.pdm.Release(releaseCtx, poolDomain, subdomain)
+	if err != nil && !errors.Is(err, pdmErrNotFound()) {
+		h.log.Warn("release-subdomain: pdm release failed",
+			"id", id,
+			"poolDomain", poolDomain,
+			"subdomain", subdomain,
+			"err", err,
+		)
+		writeJSON(w, http.StatusBadGateway, releaseSubdomainResponse{
+			DeploymentID: id,
+			PoolDomain:   poolDomain,
+			Subdomain:    subdomain,
+			Error:        "pdm release failed: " + err.Error(),
+		})
+		return
+	}
+
+	// Clear the cached PDM allocation pointers on the deployment so a
+	// follow-up Cancel & Wipe doesn't try to release a slot we just
+	// released (idempotent at PDM, but the cleared pointers also keep
+	// /events output truthful — "no PDM allocation to release").
+	dep.mu.Lock()
+	dep.pdmPoolDomain = ""
+	dep.pdmSubdomain = ""
+	dep.pdmReservationToken = ""
+	dep.mu.Unlock()
+	h.persistDeployment(dep)
+
+	h.log.Info("release-subdomain: pdm release complete",
+		"id", id,
+		"poolDomain", poolDomain,
+		"subdomain", subdomain,
+		"priorStatus", status,
+	)
+
+	writeJSON(w, http.StatusOK, releaseSubdomainResponse{
+		DeploymentID: id,
+		PoolDomain:   poolDomain,
+		Subdomain:    subdomain,
+		PDMReleased:  true,
+	})
+}
+
 // decodeJSONBody is a thin error-wrapping helper for request bodies. Other
 // handlers in this package use json.NewDecoder directly; we wrap to
 // produce a consistent 400 message.