fix(catalyst-api): release PDM subdomain on Pod-restart orphan + add explicit release endpoint (closes #489) (#502)
* fix(catalyst-api): release PDM subdomain on Pod-restart orphan + add explicit release endpoint
Each failed provision permanently consumed its pool subdomain in PDM —
otech, otech1..otech9 stayed locked because two release seams were
missing:
1. Pod-restart orphan: when catalyst-api dies mid-provisioning, the
runProvisioning goroutine that would have called pdm.Release on
Phase-0 failure dies with the Pod. fromRecord rewrites the
rehydrated status to "failed" but nothing reaps the still-active
reservation. restoreFromStore now fires a best-effort
pdm.Release for every record it rewrites from in-flight to failed,
gated on AdoptedAt==nil so customer-owned Sovereigns are protected
(see the gate sketch after this list).
2. Abandoned-deployment retries: the only operator-driven release path
was Cancel & Wipe, which requires re-entering the HetznerToken.
Franchise customers retrying under the same subdomain after a
botched provision shouldn't need a Hetzner credential roundtrip
for a PDM-only fix. New endpoint
DELETE /api/v1/deployments/{id}/release-subdomain releases the
PDM allocation only — no Hetzner work, no record deletion. Refuses
in-flight (409), wiped (410), and adopted (422) deployments (a
minimal call sketch follows the test summary below).
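
For reference, the restore-time seam in (1) reduces to one guarded
dispatch. This is condensed from the restoreFromStore hunk below and
uses only identifiers that appear in this diff:

    // Inside the branch where fromRecord just rewrote an in-flight
    // record to "failed". Release only when every PDM pointer survived
    // rehydration and the deployment was never adopted by a customer.
    if dep.Status == "failed" && rec.PDMReservationToken != "" &&
        rec.PDMPoolDomain != "" && rec.PDMSubdomain != "" &&
        dep.AdoptedAt == nil && h.pdm != nil {
        // goroutine: a slow PDM must not stall the rehydration loop
        go h.releaseOrphanedReservation(dep.ID, rec.PDMPoolDomain, rec.PDMSubdomain)
    }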
Tests cover: failed-deployment release, idempotent ErrNotFound, in-flight
refusal across all in-flight statuses, adopted protection, BYO no-op,
404 on unknown id, 502 on PDM transient, Pod-restart orphan release on
restoreFromStore, and the negative-path proof that a clean-failed record
on disk does NOT trigger a duplicate Release at restart.
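
A minimal operator-side call sketch for (2), assuming only the standard
library and the status codes this endpoint defines; the base URL and the
releaseSubdomain helper are hypothetical, not part of this commit:

    package client

    import (
        "errors"
        "fmt"
        "net/http"
    )

    // releaseSubdomain frees the PDM slot for a failed deployment.
    // base is a placeholder catalyst-api address, e.g. from config.
    func releaseSubdomain(base, id string) error {
        req, err := http.NewRequest(http.MethodDelete,
            base+"/api/v1/deployments/"+id+"/release-subdomain", nil)
        if err != nil {
            return err
        }
        resp, err := http.DefaultClient.Do(req)
        if err != nil {
            return err
        }
        defer resp.Body.Close()
        switch resp.StatusCode {
        case http.StatusOK: // released, or clean BYO no-op
            return nil
        case http.StatusConflict: // in-flight; wait or POST /wipe
            return errors.New("deployment still in-flight")
        case http.StatusGone:
            return errors.New("deployment already wiped")
        case http.StatusUnprocessableEntity: // adopted; protected
            return errors.New("adopted Sovereign; release refused")
        case http.StatusBadGateway: // PDM transient; safe to retry
            return errors.New("pdm release failed; retry")
        default:
            return fmt.Errorf("unexpected status %d", resp.StatusCode)
        }
    }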
Closes #489
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* test(catalyst-api): fix data race in fakePDM around orphan-release goroutine
The Pod-restart orphan-release path (issue #489) fires pdm.Release in a
goroutine spawned by restoreFromStore. The race detector flagged the
test's read of fpdm.releases against the goroutine's append. Adding a
sync.Mutex to fakePDM + a snapshotReleases() accessor closes the race
without changing the surface that 30+ other tests already use.
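
Reduced to the bare pattern, the race and its fix look like this (a
sketch only; the real fakePDM in this diff records more call types and
unlocks before invoking the override callback):

    type releaseCall struct{ pool, sub string }

    type fakePDM struct {
        mu       sync.Mutex
        releases []releaseCall
    }

    // Called concurrently from the orphan-release goroutine: the
    // append must happen under the mutex.
    func (f *fakePDM) Release(ctx context.Context, pool, sub string) error {
        f.mu.Lock()
        defer f.mu.Unlock()
        f.releases = append(f.releases, releaseCall{pool, sub})
        return nil
    }

    // Tests read this copy instead of f.releases; the same mutex
    // orders the read after any in-flight append, so -race is quiet.
    func (f *fakePDM) snapshotReleases() []releaseCall {
        f.mu.Lock()
        defer f.mu.Unlock()
        return append([]releaseCall(nil), f.releases...)
    }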
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---------
Co-authored-by: hatiyildiz <hatiyildiz@noreply.github.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
@@ -151,6 +151,15 @@ func main() {
 	// PurgeReport summary. The wizard's failed-state banner renders the
 	// operator confirmation modal that POSTs here.
 	r.Post("/api/v1/deployments/{id}/wipe", h.WipeDeployment)
+	// Subdomain-only release endpoint (issue #489). Releases the PDM
+	// allocation row for a failed-or-abandoned deployment WITHOUT
+	// requiring the operator to re-enter their HetznerToken. Lets a
+	// franchise customer retry under the same pool subdomain after a
+	// botched provision instead of being forced to pick acmeN+1. Does
+	// NOT touch Hetzner — the Cancel & Wipe flow remains the canonical
+	// path for live cloud cleanup. Refuses on in-flight deployments
+	// (409), wiped deployments (410), or adopted Sovereigns (422).
+	r.Delete("/api/v1/deployments/{id}/release-subdomain", h.ReleaseSubdomain)
 	// Handover finalisation (issue #317). Catalyst-Zero side: stops the
 	// helmwatch informer, ships the OpenTofu state to the new Sovereign's
 	// catalyst-api, and purges every local trace once the new side
@@ -468,6 +468,27 @@ func (h *Handler) restoreFromStore() {
 				"err", err,
 			)
 		}
+		// Pod-restart-orphan PDM release (issue #489). The previous
+		// catalyst-api Pod owned the runProvisioning goroutine that
+		// would have called pdm.Release on Phase-0 failure. The
+		// goroutine died with the Pod; the reservation is now
+		// orphaned in PDM, locking the subdomain forever. fromRecord
+		// rewrites the status to "failed" precisely BECAUSE the
+		// previous Pod was killed mid-apply — the same condition
+		// means we MUST release the PDM slot here, otherwise a
+		// retry under the same subdomain returns 409 conflict and
+		// the franchise customer is forced to pick `acmeN+1` for
+		// what should be a routine retry.
+		//
+		// Best-effort + asynchronous: a slow PDM at startup must
+		// NOT block the rest of the rehydration loop (other
+		// deployments need to land in sync.Map quickly so the
+		// /events poll works).
+		if dep.Status == "failed" && rec.PDMReservationToken != "" &&
+			rec.PDMPoolDomain != "" && rec.PDMSubdomain != "" &&
+			dep.AdoptedAt == nil && h.pdm != nil {
+			go h.releaseOrphanedReservation(dep.ID, rec.PDMPoolDomain, rec.PDMSubdomain)
+		}
 	}
 
 	// Resume the Phase-1 helmwatch goroutine after a Pod restart
@@ -498,6 +519,49 @@ func (h *Handler) restoreFromStore() {
 	)
 }
 
+// releaseOrphanedReservation calls pdm.Release for a deployment whose
+// status was rewritten to "failed" because the catalyst-api Pod died
+// mid-provisioning (issue #489). Best-effort: any failure is logged
+// at warn but does not block other rehydration work. Idempotent — a
+// pdm.ErrNotFound response (the reservation already expired or was
+// previously released) is treated as success. The dep entry's PDM
+// pointers are cleared on success so a follow-up Cancel & Wipe doesn't
+// double-release.
+//
+// Run as a goroutine from restoreFromStore: the rehydration loop must
+// not block on PDM. A 30s timeout caps each call.
+func (h *Handler) releaseOrphanedReservation(deploymentID, poolDomain, subdomain string) {
+	if h.pdm == nil || poolDomain == "" || subdomain == "" {
+		return
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+	err := h.pdm.Release(ctx, poolDomain, subdomain)
+	if err != nil && !errors.Is(err, pdm.ErrNotFound) {
+		h.log.Warn("orphan-release: pdm release failed; subdomain may stay locked until TTL",
+			"id", deploymentID,
+			"poolDomain", poolDomain,
+			"subdomain", subdomain,
+			"err", err,
+		)
+		return
+	}
+	h.log.Info("orphan-release: released PDM allocation for pod-restart-orphaned deployment",
+		"id", deploymentID,
+		"poolDomain", poolDomain,
+		"subdomain", subdomain,
+	)
+	if val, ok := h.deployments.Load(deploymentID); ok {
+		dep := val.(*Deployment)
+		dep.mu.Lock()
+		dep.pdmPoolDomain = ""
+		dep.pdmSubdomain = ""
+		dep.pdmReservationToken = ""
+		dep.mu.Unlock()
+		h.persistDeployment(dep)
+	}
+}
+
 // shouldResumePhase1 returns true when a rehydrated deployment is a
 // candidate for re-attaching the Phase-1 helmwatch goroutine. The
 // criteria are:
@@ -0,0 +1,390 @@
+// Tests for the subdomain-release path introduced by issue #489.
+//
+// Two independent contracts:
+//
+//  1. DELETE /api/v1/deployments/{id}/release-subdomain releases the PDM
+//     allocation row for a failed-or-abandoned deployment without
+//     requiring the operator's HetznerToken. This is the franchise-retry
+//     ergonomic — botched provisions must not permanently lock a pool
+//     subdomain.
+//
+//  2. restoreFromStore fires a best-effort PDM Release for any deployment
+//     it rewrites from "in-flight" to "failed" because of a Pod restart.
+//     The runProvisioning goroutine that would have called Release died
+//     with the previous Pod; the rehydration path is the next best seam.
+package handler
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+
+	"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/pdm"
+	"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/provisioner"
+	"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/store"
+)
+
+// callReleaseSubdomain wires a chi router so chi.URLParam("id") resolves
+// to the supplied id and returns the recorded response.
+func callReleaseSubdomain(h *Handler, id string) *httptest.ResponseRecorder {
+	r := chi.NewRouter()
+	r.Delete("/api/v1/deployments/{id}/release-subdomain", h.ReleaseSubdomain)
+	w := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodDelete, "/api/v1/deployments/"+id+"/release-subdomain", nil)
+	r.ServeHTTP(w, req)
+	return w
+}
+
+// TestReleaseSubdomain_FailedDeploymentReleasesPDMSlot is the headline
+// invariant: a failed deployment with a still-locked PDM allocation gets
+// its slot released so the next attempt under the same subdomain
+// succeeds.
+func TestReleaseSubdomain_FailedDeploymentReleasesPDMSlot(t *testing.T) {
+	fpdm := &fakePDM{}
+	h := NewWithPDM(silentLogger(), fpdm)
+
+	dep := &Deployment{
+		ID:                  "dep-failed-1",
+		Status:              "failed",
+		Request:             provisioner.Request{SovereignFQDN: "otech5.omani.works", SovereignDomainMode: "pool"},
+		pdmReservationToken: "rt-otech5",
+		pdmPoolDomain:       "omani.works",
+		pdmSubdomain:        "otech5",
+	}
+	h.deployments.Store(dep.ID, dep)
+
+	w := callReleaseSubdomain(h, dep.ID)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", w.Code, w.Body.String())
+	}
+
+	var resp releaseSubdomainResponse
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if !resp.PDMReleased {
+		t.Errorf("PDMReleased=false, want true (body=%s)", w.Body.String())
+	}
+	if resp.PoolDomain != "omani.works" || resp.Subdomain != "otech5" {
+		t.Errorf("response pool/sub = %q/%q, want omani.works/otech5", resp.PoolDomain, resp.Subdomain)
+	}
+
+	// PDM client received exactly one Release call with the correct args.
+	if got := len(fpdm.releases); got != 1 {
+		t.Fatalf("expected 1 PDM Release call, got %d (%+v)", got, fpdm.releases)
+	}
+	if fpdm.releases[0].pool != "omani.works" || fpdm.releases[0].sub != "otech5" {
+		t.Errorf("Release args = %+v, want omani.works/otech5", fpdm.releases[0])
+	}
+
+	// Deployment's PDM pointers were cleared so a follow-up Cancel & Wipe
+	// doesn't try to release a slot we just released.
+	if dep.pdmPoolDomain != "" || dep.pdmSubdomain != "" || dep.pdmReservationToken != "" {
+		t.Errorf("PDM pointers not cleared: pool=%q sub=%q token=%q",
+			dep.pdmPoolDomain, dep.pdmSubdomain, dep.pdmReservationToken)
+	}
+}
+
+// TestReleaseSubdomain_PDMNotFoundIsIdempotent — calling release twice
+// must succeed the second time. PDM returns 404/ErrNotFound when the
+// allocation has already been released; we treat that as success.
+func TestReleaseSubdomain_PDMNotFoundIsIdempotent(t *testing.T) {
+	fpdm := &fakePDM{
+		release: func(ctx context.Context, pool, sub string) error {
+			return pdm.ErrNotFound
+		},
+	}
+	h := NewWithPDM(silentLogger(), fpdm)
+
+	dep := &Deployment{
+		ID:            "dep-not-found",
+		Status:        "failed",
+		Request:       provisioner.Request{SovereignFQDN: "otech.omani.works", SovereignDomainMode: "pool"},
+		pdmPoolDomain: "omani.works",
+		pdmSubdomain:  "otech",
+	}
+	h.deployments.Store(dep.ID, dep)
+
+	w := callReleaseSubdomain(h, dep.ID)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s — pdm.ErrNotFound must be treated as success", w.Code, w.Body.String())
+	}
+	var resp releaseSubdomainResponse
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if !resp.PDMReleased {
+		t.Errorf("PDMReleased=false on ErrNotFound; idempotent contract requires true")
+	}
+}
+
+// TestReleaseSubdomain_InFlightDeploymentRefuses prevents stripping a
+// reservation out from under a runProvisioning goroutine that may yet
+// call Commit.
+func TestReleaseSubdomain_InFlightDeploymentRefuses(t *testing.T) {
+	fpdm := &fakePDM{}
+	h := NewWithPDM(silentLogger(), fpdm)
+
+	for _, status := range []string{"pending", "provisioning", "tofu-applying", "phase1-watching"} {
+		t.Run(status, func(t *testing.T) {
+			fpdm.releases = nil
+			dep := &Deployment{
+				ID:            "dep-" + status,
+				Status:        status,
+				Request:       provisioner.Request{SovereignFQDN: "otech.omani.works", SovereignDomainMode: "pool"},
+				pdmPoolDomain: "omani.works",
+				pdmSubdomain:  "otech",
+			}
+			h.deployments.Store(dep.ID, dep)
+
+			w := callReleaseSubdomain(h, dep.ID)
+
+			if w.Code != http.StatusConflict {
+				t.Errorf("status=%d, want 409 (in-flight) — body=%s", w.Code, w.Body.String())
+			}
+			if got := len(fpdm.releases); got != 0 {
+				t.Errorf("PDM Release was called for in-flight deployment (%d times)", got)
+			}
+		})
+	}
+}
+
+// TestReleaseSubdomain_AdoptedDeploymentRefuses protects customer-owned
+// Sovereigns from accidental subdomain release. The Sovereign console
+// is reachable on the committed subdomain; releasing would 404 every
+// customer request until the operator runs a re-commit.
+func TestReleaseSubdomain_AdoptedDeploymentRefuses(t *testing.T) {
+	fpdm := &fakePDM{}
+	h := NewWithPDM(silentLogger(), fpdm)
+
+	now := time.Now().UTC()
+	dep := &Deployment{
+		ID:            "dep-adopted",
+		Status:        "ready",
+		Request:       provisioner.Request{SovereignFQDN: "acme.omani.works", SovereignDomainMode: "pool"},
+		pdmPoolDomain: "omani.works",
+		pdmSubdomain:  "acme",
+		AdoptedAt:     &now,
+	}
+	h.deployments.Store(dep.ID, dep)
+
+	w := callReleaseSubdomain(h, dep.ID)
+
+	if w.Code != http.StatusUnprocessableEntity {
+		t.Errorf("status=%d, want 422 (adopted) — body=%s", w.Code, w.Body.String())
+	}
+	if got := len(fpdm.releases); got != 0 {
+		t.Errorf("PDM Release fired for adopted deployment (%d times)", got)
+	}
+}
+
+// TestReleaseSubdomain_BYODeploymentNoOp — BYO domains never have a PDM
+// allocation. Calling release should return 200 with a noOp marker so
+// wizard UI flows can call this unconditionally.
+func TestReleaseSubdomain_BYODeploymentNoOp(t *testing.T) {
+	fpdm := &fakePDM{}
+	h := NewWithPDM(silentLogger(), fpdm)
+
+	dep := &Deployment{
+		ID:      "dep-byo",
+		Status:  "failed",
+		Request: provisioner.Request{SovereignFQDN: "k8s.customer.example", SovereignDomainMode: "byo"},
+	}
+	h.deployments.Store(dep.ID, dep)
+
+	w := callReleaseSubdomain(h, dep.ID)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", w.Code, w.Body.String())
+	}
+	var resp releaseSubdomainResponse
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if resp.PDMReleased {
+		t.Errorf("PDMReleased=true on BYO; want false (no allocation to release)")
+	}
+	if resp.NoOp == "" {
+		t.Errorf("NoOp empty on BYO; want explanatory marker")
+	}
+	if got := len(fpdm.releases); got != 0 {
+		t.Errorf("PDM Release fired on BYO deployment (%d times)", got)
+	}
+}
+
+// TestReleaseSubdomain_UnknownDeploymentIs404
+func TestReleaseSubdomain_UnknownDeploymentIs404(t *testing.T) {
+	h := NewWithPDM(silentLogger(), &fakePDM{})
+	w := callReleaseSubdomain(h, "no-such-deployment")
+	if w.Code != http.StatusNotFound {
+		t.Errorf("status=%d, want 404 — body=%s", w.Code, w.Body.String())
+	}
+}
+
+// TestReleaseSubdomain_PDMTransientFailureSurfaces502 — when PDM is up
+// but returns an error other than NotFound (e.g. 500 or network blip),
+// surface 502 so the operator knows to retry.
+func TestReleaseSubdomain_PDMTransientFailureSurfaces502(t *testing.T) {
+	fpdm := &fakePDM{
+		release: func(ctx context.Context, pool, sub string) error {
+			return errors.New("connection refused")
+		},
+	}
+	h := NewWithPDM(silentLogger(), fpdm)
+
+	dep := &Deployment{
+		ID:            "dep-transient",
+		Status:        "failed",
+		Request:       provisioner.Request{SovereignFQDN: "otech.omani.works", SovereignDomainMode: "pool"},
+		pdmPoolDomain: "omani.works",
+		pdmSubdomain:  "otech",
+	}
+	h.deployments.Store(dep.ID, dep)
+
+	w := callReleaseSubdomain(h, dep.ID)
+
+	if w.Code != http.StatusBadGateway {
+		t.Errorf("status=%d, want 502 — body=%s", w.Code, w.Body.String())
+	}
+}
+
+// TestRestoreFromStore_PodRestartOrphanReleasesPDMSlot covers the
+// rehydration path: when a Pod dies mid-provisioning, fromRecord rewrites
+// the on-disk status to "failed". restoreFromStore must, in that exact
+// case, fire pdm.Release for the orphaned reservation. Without this,
+// the "12-otech" symptom from issue #489 reproduces — every Pod restart
+// during provisioning permanently locks a subdomain.
+func TestRestoreFromStore_PodRestartOrphanReleasesPDMSlot(t *testing.T) {
+	dir := t.TempDir()
+	st, err := store.New(dir)
+	if err != nil {
+		t.Fatalf("store.New: %v", err)
+	}
+
+	// Seed an on-disk record that looks like a Pod was killed during
+	// `tofu apply`: status="tofu-applying" (in-flight), with a PDM
+	// reservation already taken.
+	rec := store.Record{
+		ID:                  "pod-killed-deployment",
+		Status:              "tofu-applying", // in-flight at kill time
+		PDMReservationToken: "rt-orphan",
+		PDMPoolDomain:       "omani.works",
+		PDMSubdomain:        "otech7",
+		Request: store.RedactedRequest{
+			SovereignFQDN:       "otech7.omani.works",
+			SovereignDomainMode: "pool",
+		},
+		StartedAt: time.Now().UTC(),
+	}
+	if err := st.Save(rec); err != nil {
+		t.Fatalf("seed record: %v", err)
+	}
+
+	// New PDM that records every Release call.
+	fpdm := &fakePDM{}
+
+	// NewWithStore runs restoreFromStore; the rewrite-to-failed branch
+	// fires releaseOrphanedReservation as a goroutine.
+	h := NewWithStore(silentLogger(), fpdm, st)
+
+	// Wait briefly for the async release to fire — capped at 2s. Use
+	// the mutex-protected snapshot accessor; a direct read of
+	// fpdm.releases here races the goroutine's append (race detector
+	// flagged it on first CI run before the accessor was added).
+	deadline := time.Now().Add(2 * time.Second)
+	var releases []releaseCall
+	for time.Now().Before(deadline) {
+		releases = fpdm.snapshotReleases()
+		if len(releases) > 0 {
+			break
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+
+	if got := len(releases); got != 1 {
+		t.Fatalf("expected 1 orphan PDM Release; got %d (%+v)", got, releases)
+	}
+	if releases[0].pool != "omani.works" || releases[0].sub != "otech7" {
+		t.Errorf("orphan Release args = %+v, want omani.works/otech7", releases[0])
+	}
+
+	// Deployment landed in sync.Map with status rewritten to "failed".
+	val, ok := h.deployments.Load("pod-killed-deployment")
+	if !ok {
+		t.Fatalf("deployment not in sync.Map after restore")
+	}
+	dep := val.(*Deployment)
+	if dep.Status != "failed" {
+		t.Errorf("Status=%q, want failed (Pod-restart rewrite contract)", dep.Status)
+	}
+
+	// Wait for the success-path field clear (the goroutine clears the
+	// pointers + persists after a successful release).
+	deadline = time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		dep.mu.Lock()
+		cleared := dep.pdmPoolDomain == "" && dep.pdmSubdomain == ""
+		dep.mu.Unlock()
+		if cleared {
+			break
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+	dep.mu.Lock()
+	defer dep.mu.Unlock()
+	if dep.pdmPoolDomain != "" || dep.pdmSubdomain != "" {
+		t.Errorf("PDM pointers not cleared after orphan-release: pool=%q sub=%q",
+			dep.pdmPoolDomain, dep.pdmSubdomain)
+	}
+}
+
+// TestRestoreFromStore_TerminalRecordDoesNotReleasePDM proves we DON'T
+// re-release a deployment that finished cleanly (status="failed" was
+// already the terminal state on disk, not a rewrite). A clean failure
+// already had its release path run by the original Pod's
+// runProvisioning goroutine; firing a second Release at restart time
+// would be a noisy duplicate.
+func TestRestoreFromStore_TerminalRecordDoesNotReleasePDM(t *testing.T) {
+	dir := t.TempDir()
+	st, err := store.New(dir)
+	if err != nil {
+		t.Fatalf("store.New: %v", err)
+	}
+
+	// Record was already "failed" on disk — NOT an in-flight kill.
+	rec := store.Record{
+		ID:                  "clean-fail",
+		Status:              "failed",
+		PDMReservationToken: "rt-clean",
+		PDMPoolDomain:       "omani.works",
+		PDMSubdomain:        "otech-clean",
+		Request: store.RedactedRequest{
+			SovereignFQDN:       "otech-clean.omani.works",
+			SovereignDomainMode: "pool",
+		},
+		StartedAt: time.Now().UTC(),
+	}
+	if err := st.Save(rec); err != nil {
+		t.Fatalf("seed record: %v", err)
+	}
+
+	fpdm := &fakePDM{}
+	_ = NewWithStore(silentLogger(), fpdm, st)
+
+	// Allow a generous window for any spurious goroutine to fire.
+	time.Sleep(150 * time.Millisecond)
+
+	releases := fpdm.snapshotReleases()
+	if got := len(releases); got != 0 {
+		t.Errorf("expected 0 PDM Release calls for already-terminal record, got %d (%+v)",
+			got, releases)
+	}
+}
@@ -23,6 +23,7 @@ import (
 	"net/http"
 	"net/http/httptest"
 	"strings"
+	"sync"
 	"testing"
 
 	"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/pdm"
@@ -30,7 +31,13 @@ import (
 
 // fakePDM is a stub pdmClient that records every call. We assert against the
 // recorded calls to prove the behaviour the architecture requires.
+//
+// mu guards the recorded-call slices because the orphan-release path
+// introduced by issue #489 fires pdm.Release in a goroutine spawned by
+// restoreFromStore. Without the mutex, -race flags the slice append in
+// Release racing the test's read.
 type fakePDM struct {
+	mu       sync.Mutex
 	checks   []checkCall
 	check    func(ctx context.Context, pool, sub string) (*pdm.CheckResult, error)
 	reserves []reserveCall
@@ -41,22 +48,40 @@ type fakePDM struct {
 	release  func(ctx context.Context, pool, sub string) error
 }
 
+// snapshotReleases returns a copy of the recorded Release calls under
+// the mutex. Tests that race goroutines against the recorder MUST use
+// this instead of reading `f.releases` directly, otherwise -race flags
+// the read.
+func (f *fakePDM) snapshotReleases() []releaseCall {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	out := make([]releaseCall, len(f.releases))
+	copy(out, f.releases)
+	return out
+}
+
 type checkCall struct{ pool, sub string }
 type reserveCall struct{ pool, sub, by string }
 type releaseCall struct{ pool, sub string }
 
 func (f *fakePDM) Check(ctx context.Context, pool, sub string) (*pdm.CheckResult, error) {
+	f.mu.Lock()
 	f.checks = append(f.checks, checkCall{pool, sub})
-	if f.check != nil {
-		return f.check(ctx, pool, sub)
+	cb := f.check
+	f.mu.Unlock()
+	if cb != nil {
+		return cb(ctx, pool, sub)
 	}
 	return &pdm.CheckResult{Available: true, FQDN: sub + "." + pool}, nil
 }
 
 func (f *fakePDM) Reserve(ctx context.Context, pool, sub, by string) (*pdm.Reservation, error) {
+	f.mu.Lock()
 	f.reserves = append(f.reserves, reserveCall{pool, sub, by})
-	if f.reserve != nil {
-		return f.reserve(ctx, pool, sub, by)
+	cb := f.reserve
+	f.mu.Unlock()
+	if cb != nil {
+		return cb(ctx, pool, sub, by)
 	}
 	return &pdm.Reservation{
 		PoolDomain: pool, Subdomain: sub, State: "reserved",
@@ -65,17 +90,23 @@ func (f *fakePDM) Reserve(ctx context.Context, pool, sub, by string) (*pdm.Reser
 }
 
 func (f *fakePDM) Commit(ctx context.Context, pool string, in pdm.CommitInput) error {
+	f.mu.Lock()
 	f.commits = append(f.commits, in)
-	if f.commit != nil {
-		return f.commit(ctx, pool, in)
+	cb := f.commit
+	f.mu.Unlock()
+	if cb != nil {
+		return cb(ctx, pool, in)
 	}
 	return nil
 }
 
 func (f *fakePDM) Release(ctx context.Context, pool, sub string) error {
+	f.mu.Lock()
 	f.releases = append(f.releases, releaseCall{pool, sub})
-	if f.release != nil {
-		return f.release(ctx, pool, sub)
+	cb := f.release
+	f.mu.Unlock()
+	if cb != nil {
+		return cb(ctx, pool, sub)
 	}
 	return nil
 }
 
@@ -46,9 +46,15 @@ import (
 	"github.com/go-chi/chi/v5"
 
 	"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/hetzner"
+	"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/pdm"
 	"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/provisioner"
 )
 
+// pdmErrNotFound returns the sentinel value to compare against PDM Release
+// outcomes. Wrapped in a function so callers can use errors.Is without
+// importing the pdm package at every call site.
+func pdmErrNotFound() error { return pdm.ErrNotFound }
+
 // wipeRequest is the body of POST /api/v1/deployments/{id}/wipe.
 //
 // HetznerToken is required ONLY when the on-disk Deployment record's
@@ -294,6 +300,184 @@ func deploymentSovereignName(fqdn string) string {
 	return strings.ReplaceAll(fqdn, ".", "-")
 }
 
+// releaseSubdomainResponse is the wire shape of DELETE
+// /api/v1/deployments/{id}/release-subdomain — a subdomain-only release
+// path that does NOT require the HetznerToken (issue #489). The full
+// Cancel & Wipe flow remains the canonical purge for live Hetzner
+// resources; this endpoint is the narrower fix for the case where a
+// failed-or-abandoned deployment locks a pool subdomain that an
+// operator wants to retry under the SAME name.
+type releaseSubdomainResponse struct {
+	DeploymentID string `json:"deploymentId"`
+	PoolDomain   string `json:"poolDomain"`
+	Subdomain    string `json:"subdomain"`
+	PDMReleased  bool   `json:"pdmReleased"`
+	NoOp         string `json:"noOp,omitempty"`
+	Error        string `json:"error,omitempty"`
+}
+
+// ReleaseSubdomain handles DELETE /api/v1/deployments/{id}/release-subdomain.
+//
+// Issue #489 — each failed provision permanently consumed its subdomain
+// because the only release seams were (a) runProvisioning's post-Phase-0
+// failure path, dead after a Pod restart, and (b) the Cancel & Wipe flow
+// which requires the operator to re-enter the HetznerToken. For franchise
+// customers a retry of `acme.omani.works` after a failed `acme.omani.works`
+// must NOT need a new subdomain or a HetznerToken roundtrip — the slot
+// belongs to the customer, not to the failed attempt.
+//
+// This endpoint:
+//
+//   - Looks up the deployment by id.
+//   - Refuses to release a deployment that is still in-flight (operator
+//     must wait or wipe properly).
+//   - Refuses to release a deployment whose AdoptedAt is set — that's a
+//     production customer Sovereign, not an abandoned attempt.
+//   - Calls PDM Release for the deployment's pdmPoolDomain + pdmSubdomain.
+//   - Treats pdm.ErrNotFound as success (idempotent — a second call after
+//     PDM has already released the slot is a no-op).
+//   - Does NOT touch Hetzner (no destroy, no orphan purge), does NOT
+//     delete the on-disk record, does NOT mark the deployment "wiped".
+//     The operator can still inspect the failed deployment + run a
+//     proper Cancel & Wipe later. The only mutation is on PDM.
+//
+// Response codes:
+//
+//	200 — release succeeded (or was a no-op)
+//	404 — unknown deployment id
+//	409 — deployment is still in-flight; cannot release while running
+//	410 — deployment was already wiped
+//	422 — deployment has been adopted by a customer; protected
+//	502 — PDM call failed (operator can retry)
+func (h *Handler) ReleaseSubdomain(w http.ResponseWriter, r *http.Request) {
+	id := chi.URLParam(r, "id")
+
+	val, ok := h.deployments.Load(id)
+	if !ok {
+		http.Error(w, "deployment not found", http.StatusNotFound)
+		return
+	}
+	dep := val.(*Deployment)
+
+	dep.mu.Lock()
+	status := dep.Status
+	poolDomain := dep.pdmPoolDomain
+	subdomain := dep.pdmSubdomain
+	adopted := dep.AdoptedAt != nil
+	dep.mu.Unlock()
+
+	// Adopted deployments are customer-owned Sovereigns — never strip a
+	// committed pool record from underneath an active customer.
+	if adopted {
+		writeJSON(w, http.StatusUnprocessableEntity, releaseSubdomainResponse{
+			DeploymentID: id,
+			PoolDomain:   poolDomain,
+			Subdomain:    subdomain,
+			Error:        "deployment has been adopted by a customer; the pool record protects the live Sovereign and cannot be released via this endpoint",
+		})
+		return
+	}
+
+	// Refuse to release a deployment that is still in-flight — the
+	// runProvisioning goroutine may still call Commit. Operator must
+	// wait for terminal state, or run the full Cancel & Wipe flow.
+	if isInFlightStatus(status) {
+		writeJSON(w, http.StatusConflict, releaseSubdomainResponse{
+			DeploymentID: id,
+			PoolDomain:   poolDomain,
+			Subdomain:    subdomain,
+			Error:        "deployment is still in-flight (status=" + status + ") — wait for terminal state or use POST /wipe",
+		})
+		return
+	}
+	if status == "wiped" {
+		writeJSON(w, http.StatusGone, releaseSubdomainResponse{
+			DeploymentID: id,
+			PoolDomain:   poolDomain,
+			Subdomain:    subdomain,
+			Error:        "deployment already wiped",
+		})
+		return
+	}
+
+	// Fallback FQDN split for older records that committed without
+	// stamping pdmPoolDomain/pdmSubdomain (the wipe path uses the same
+	// fallback; keeping behaviour symmetric).
+	if poolDomain == "" || subdomain == "" {
+		if idx := strings.IndexByte(dep.Request.SovereignFQDN, '.'); idx > 0 {
+			subdomain = dep.Request.SovereignFQDN[:idx]
+			poolDomain = dep.Request.SovereignFQDN[idx+1:]
+		}
+	}
+
+	// BYO deployments don't have a PDM allocation to release. Surface
+	// that as a clean 200 no-op so wizard UI flows can call this
+	// unconditionally.
+	if dep.Request.SovereignDomainMode != "pool" || poolDomain == "" || subdomain == "" {
+		writeJSON(w, http.StatusOK, releaseSubdomainResponse{
+			DeploymentID: id,
+			PoolDomain:   poolDomain,
+			Subdomain:    subdomain,
+			NoOp:         "no pool allocation to release (BYO or unresolvable pool)",
+		})
+		return
+	}
+
+	if h.pdm == nil {
+		writeJSON(w, http.StatusServiceUnavailable, releaseSubdomainResponse{
+			DeploymentID: id,
+			PoolDomain:   poolDomain,
+			Subdomain:    subdomain,
+			Error:        "pool-domain-manager client is not configured",
+		})
+		return
+	}
+
+	releaseCtx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
+	defer cancel()
+	err := h.pdm.Release(releaseCtx, poolDomain, subdomain)
+	if err != nil && !errors.Is(err, pdmErrNotFound()) {
+		h.log.Warn("release-subdomain: pdm release failed",
+			"id", id,
+			"poolDomain", poolDomain,
+			"subdomain", subdomain,
+			"err", err,
+		)
+		writeJSON(w, http.StatusBadGateway, releaseSubdomainResponse{
+			DeploymentID: id,
+			PoolDomain:   poolDomain,
+			Subdomain:    subdomain,
+			Error:        "pdm release failed: " + err.Error(),
+		})
+		return
+	}
+
+	// Clear the cached PDM allocation pointers on the deployment so a
+	// follow-up Cancel & Wipe doesn't try to release a slot we just
+	// released (idempotent at PDM, but the cleared pointers also keep
+	// /events output truthful — "no PDM allocation to release").
+	dep.mu.Lock()
+	dep.pdmPoolDomain = ""
+	dep.pdmSubdomain = ""
+	dep.pdmReservationToken = ""
+	dep.mu.Unlock()
+	h.persistDeployment(dep)
+
+	h.log.Info("release-subdomain: pdm release complete",
+		"id", id,
+		"poolDomain", poolDomain,
+		"subdomain", subdomain,
+		"priorStatus", status,
+	)
+
+	writeJSON(w, http.StatusOK, releaseSubdomainResponse{
+		DeploymentID: id,
+		PoolDomain:   poolDomain,
+		Subdomain:    subdomain,
+		PDMReleased:  true,
+	})
+}
+
 // decodeJSONBody is a thin error-wrapping helper for request bodies. Other
 // handlers in this package use json.NewDecoder directly; we wrap to
 // produce a consistent 400 message.