feat(wizard): #125 retry-phase endpoint + UX for failed bootstrap-kit phases

Group I leftover. New POST /api/v1/deployments/{id}/phases/{phase}/retry endpoint distinguishes:
- Phase 0 (tofu-*) → catalyst-api re-runs tofu apply against the existing workdir (idempotent per OpenTofu state model)
- Phase 1 (bootstrap-kit HelmReleases) → Flux owns reconciliation per Lesson #24; HelmRelease.spec.install.remediation.retries=3 handles transient failures automatically; operator-driven retries go via the Flux Receiver webhook published by bp-catalyst-platform (NEVER kubectl/helm exec from catalyst-api)

BootstrapProgress.tsx extended:
- Failed-phase rendering (red border, error message from event stream)
- "Retry phase" button (only Phase 0 phases) calling the new endpoint
- "View runbook" link to docs/RUNBOOK-PROVISIONING.md for operator-driven retries

Closes #125 — failed-phase UX.
This commit is contained in:
hatiyildiz 2026-04-28 14:29:10 +02:00 committed by Emrah Baysal
parent 7ef93f4d06
commit cf60bd77dd
3 changed files with 442 additions and 16 deletions

View File

@ -37,6 +37,11 @@ func main() {
r.Post("/api/v1/deployments", h.CreateDeployment)
r.Get("/api/v1/deployments/{id}", h.GetDeployment)
r.Get("/api/v1/deployments/{id}/logs", h.StreamLogs)
// Phase-retry endpoint for the wizard's failed-phase UX (issue #125).
// Phase 0 retries re-run `tofu apply` against the existing workdir;
// Phase 1 retries emit operator instructions per the architectural
// contract (Flux owns Phase 1 reconciliation).
r.Post("/api/v1/deployments/{id}/phases/{phase}/retry", h.RetryPhase)
log.Info("catalyst api listening", "port", port)
if err := http.ListenAndServe(":"+port, r); err != nil {

View File

@ -0,0 +1,239 @@
// Phase-retry endpoint for the wizard's failed-phase UX (issue #125).
//
// When a provisioning phase fails, the wizard renders the failed phase
// with a "Retry phase" button. This endpoint accepts that retry and
// re-drives the phase, distinguishing two architectural cases:
//
// 1. Phase 0 phases (tofu-init, tofu-plan, tofu-apply, tofu-output,
// flux-bootstrap) — catalyst-api owns the OpenTofu workdir directly,
// so we re-run `tofu apply` against the same workdir. Re-runs are
// idempotent (OpenTofu's state model). This is in-bounds: Phase 0
// IS the catalyst-api's job per docs/SOVEREIGN-PROVISIONING.md §3.
//
// 2. Phase 1 bootstrap-kit phases (cilium, cert-manager, flux,
// crossplane, sealed-secrets, spire, jetstream, openbao, keycloak,
// gitea, bp-catalyst-platform) — these are Flux HelmReleases on the
// NEW Sovereign's cluster. Per docs/INVIOLABLE-PRINCIPLES.md #3
// ("Flux is the ONLY GitOps reconciler") and Lesson #24, the
// catalyst-api MUST NOT exec kubectl/helm to drive Phase 1. Flux
// itself has built-in retry (HelmRelease.spec.install.remediation.
// retries: 3) which handles transient failures automatically.
//
// For operator-driven retries (after the automatic retry exhausts),
// the documented path is the Flux Receiver webhook published by
// bp-catalyst-platform — the wizard POSTs the receiver token + the
// specific HelmRelease name, and the new Sovereign's notification-
// controller annotates the HelmRelease for fresh reconciliation.
// The receiver URL + token are Phase 0 outputs that flow through
// the OpenTofu module's flux-bootstrap step. Until the receiver is
// wired through cloud-init (separate ticket — outside the UX scope
// of #125), this endpoint emits a structured event pointing the
// operator at the runbook's "Rollback procedures per phase" section
// for manual `flux reconcile helmrelease` instructions.
//
// In both cases, the endpoint streams events back through the same
// SSE channel as the original deployment — the wizard's BootstrapProgress
// widget continues to render the live state without needing a second
// stream. We re-open the deployment.Events channel by replacing it on
// the Deployment struct (after the original channel closed when
// runProvisioning finished).
package handler
import (
"context"
"errors"
"fmt"
"net/http"
"strings"
"time"
"github.com/go-chi/chi/v5"
"github.com/openova-io/openova/products/catalyst/bootstrap/api/internal/provisioner"
)
// phase0Phases is the set of phase ids that RetryPhase routes to
// retryPhase0, i.e. the OpenTofu-driven steps that catalyst-api re-runs
// itself against the per-deployment workdir.
var phase0Phases = func() map[string]bool {
	ids := [...]string{
		"tofu-init",
		"tofu-plan",
		"tofu-apply",
		"tofu-output",
		"flux-bootstrap",
	}
	set := make(map[string]bool, len(ids))
	for _, id := range ids {
		set[id] = true
	}
	return set
}()
// phase1Phases is the set of phase ids that RetryPhase routes to
// retryPhase1: the bootstrap-kit components that Flux reconciles on the
// new Sovereign. catalyst-api never execs kubectl/helm for these; Flux
// owns their retry loop.
var phase1Phases = func() map[string]bool {
	ids := [...]string{
		"cilium",
		"cert-manager",
		"flux",
		"crossplane",
		"sealed-secrets",
		"spire",
		"jetstream",
		"openbao",
		"keycloak",
		"gitea",
		"bp-catalyst-platform",
	}
	set := make(map[string]bool, len(ids))
	for _, id := range ids {
		set[id] = true
	}
	return set
}()
// RetryPhase handles POST /api/v1/deployments/{id}/phases/{phase}/retry.
//
// It looks the deployment up by the {id} URL parameter, refuses the
// request while the deployment is in-flight, and then dispatches on the
// {phase} URL parameter to retryPhase0 or retryPhase1.
//
// Responses:
//
//	200 — retry accepted (body written by retryPhase0 / retryPhase1)
//	400 — phase id is in neither phase0Phases nor phase1Phases
//	404 — no deployment stored under the given id
//	409 — deployment status is "provisioning" or "tofu-applying"
func (h *Handler) RetryPhase(w http.ResponseWriter, r *http.Request) {
	deploymentID := chi.URLParam(r, "id")
	phaseID := chi.URLParam(r, "phase")

	entry, found := h.deployments.Load(deploymentID)
	if !found {
		http.Error(w, "deployment not found", http.StatusNotFound)
		return
	}
	dep := entry.(*Deployment)

	// Snapshot the status under the lock; the comparison itself does not
	// need to hold it.
	dep.mu.Lock()
	status := dep.Status
	dep.mu.Unlock()

	if status == "provisioning" || status == "tofu-applying" {
		writeJSON(w, http.StatusConflict, map[string]string{
			"error": "deployment is still in-flight — wait for the current phase to finish before retrying",
		})
		return
	}

	if phase0Phases[phaseID] {
		h.retryPhase0(w, dep, phaseID)
		return
	}
	if phase1Phases[phaseID] {
		h.retryPhase1(w, dep, phaseID)
		return
	}

	writeJSON(w, http.StatusBadRequest, map[string]string{
		"error": fmt.Sprintf("unknown phase %q — see docs/RUNBOOK-PROVISIONING.md for the canonical phase list", phaseID),
	})
}
// retryPhase0 re-drives the OpenTofu workflow against the deployment's
// existing workdir. The retry runs the FULL phase 0 sequence (init →
// plan → apply → output → flux-bootstrap) because OpenTofu's plan/apply
// model is "the whole stack converges to declared state," not "re-run
// only this step." Idempotency means failed-on-apply with a transient
// error (e.g. Hetzner rate-limit) becomes a successful apply on retry.
//
// The in-flight check is repeated here, inside the same critical section
// that flips the status to "provisioning". The caller's check runs under
// a separate lock acquisition, so without this two concurrent retry
// requests could both pass it, start two provisioning goroutines and
// end up closing the events channel twice.
//
// Writes 409 if the deployment is already in-flight, otherwise 200 with
// the deployment id, phase, and the SSE stream URL to reconnect to.
func (h *Handler) retryPhase0(w http.ResponseWriter, dep *Deployment, phase string) {
	dep.mu.Lock()
	if dep.Status == "provisioning" || dep.Status == "tofu-applying" {
		dep.mu.Unlock()
		writeJSON(w, http.StatusConflict, map[string]string{
			"error": "deployment is still in-flight — wait for the current phase to finish before retrying",
		})
		return
	}
	// Re-open the events channel — the original was closed when
	// runProvisioning returned. The wizard's SSE client reconnects
	// to /logs which reads from this fresh channel.
	dep.Events = make(chan provisioner.Event, 256)
	dep.Status = "provisioning"
	dep.Error = ""
	dep.FinishedAt = time.Time{}
	dep.mu.Unlock()

	go h.runProvisioningRetry(dep, phase)

	writeJSON(w, http.StatusOK, map[string]string{
		"id":        dep.ID,
		"status":    "provisioning",
		"phase":     phase,
		"streamURL": fmt.Sprintf("/api/v1/deployments/%s/logs", dep.ID),
		"message":   "Phase 0 retry accepted — re-running tofu apply against the existing workdir (idempotent). Reopen the SSE stream to follow progress.",
	})
}
// retryPhase1 emits a structured event explaining that Flux owns the
// HelmRelease retry loop and pointing the operator at the runbook for
// manual reconciliation if Flux's automatic remediation has already
// exhausted (`install.remediation.retries: 3`).
//
// We do NOT exec kubectl here — that would violate Lesson #24. The
// architectural retry primitive for Phase 1 is Flux's own
// remediation, plus a notification-controller Receiver webhook on the
// new Sovereign (wired through a separate ticket).
//
// Writes 409 if the deployment turned in-flight since the caller's
// check (so a live provisioning stream is never replaced), otherwise
// 200 with status "manual-retry-required", the runbook anchor and the
// SSE stream URL.
func (h *Handler) retryPhase1(w http.ResponseWriter, dep *Deployment, phase string) {
	// Phase ids are bare component names ("cilium"), except
	// "bp-catalyst-platform" which already carries the blueprint prefix.
	// Normalise so the suggested command never names "bp-bp-…".
	// NOTE(review): presumes every HelmRelease is named bp-<component> —
	// confirm against the bootstrap-kit manifests.
	releaseName := "bp-" + strings.TrimPrefix(phase, "bp-")

	// Build the stream completely before publishing it: one buffered
	// event followed by close. The send cannot block on a fresh channel
	// with capacity 16, so no goroutine is needed, and the channel is
	// only ever closed through this local handle — never through a
	// re-read of dep.Events that a concurrent retry may have replaced.
	events := make(chan provisioner.Event, 16)
	events <- provisioner.Event{
		Time:  time.Now().UTC().Format(time.RFC3339),
		Phase: phase,
		Level: "info",
		Message: "Phase 1 retry: this HelmRelease is reconciled by Flux on the new Sovereign (not by catalyst-api). " +
			"Flux applies install.remediation.retries=3 automatically; if those exhausted, the operator runs " +
			"`kubectl annotate --overwrite helmrelease/" + releaseName + " -n flux-system reconcile.fluxcd.io/requestedAt=$(date +%s)` " +
			"on the new Sovereign's kube-context. See docs/RUNBOOK-PROVISIONING.md " +
			"§Rollback-procedures-per-phase for the full procedure.",
	}
	close(events)

	dep.mu.Lock()
	if dep.Status == "provisioning" || dep.Status == "tofu-applying" {
		dep.mu.Unlock()
		writeJSON(w, http.StatusConflict, map[string]string{
			"error": "deployment is still in-flight — wait for the current phase to finish before retrying",
		})
		return
	}
	// The SSE client reconnecting to /logs reads this event and then
	// sees the channel close.
	dep.Events = events
	dep.mu.Unlock()

	writeJSON(w, http.StatusOK, map[string]string{
		"id":        dep.ID,
		"status":    "manual-retry-required",
		"phase":     phase,
		"streamURL": fmt.Sprintf("/api/v1/deployments/%s/logs", dep.ID),
		"runbook":   "docs/RUNBOOK-PROVISIONING.md#rollback-procedures-per-phase",
		"message":   fmt.Sprintf("Phase 1 (%s) is owned by Flux on the new Sovereign — operator action required if automatic remediation exhausted.", phase),
	})
}
// runProvisioningRetry runs provisioner.Provision for a retried
// deployment and records the outcome on dep. It is started as a
// goroutine by retryPhase0 after the deployment's events channel has
// been replaced and its status set to "provisioning".
//
// It emits one "retry initiated" event tagged with retriedPhase, calls
// Provision with dep.Request under a 30-minute timeout, then sets
// dep.FinishedAt and either Status="failed" + Error or Status="ready" +
// Result. The events channel is closed when the function returns.
//
// NOTE(review): the original comment states that Provision re-uses the
// existing workdir and is idempotent against it; that behavior lives in
// the provisioner package and is not visible here — confirm there.
func (h *Handler) runProvisioningRetry(dep *Deployment, retriedPhase string) {
	// Take one snapshot of the channel under the lock that guards writes
	// to dep.Events. Every send, the Provision call and the final close
	// then act on the same channel, even if the field is replaced later.
	dep.mu.Lock()
	events := dep.Events
	dep.mu.Unlock()
	defer close(events)

	prov := provisioner.New()
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
	defer cancel()

	events <- provisioner.Event{
		Time:    time.Now().UTC().Format(time.RFC3339),
		Phase:   retriedPhase,
		Level:   "info",
		Message: fmt.Sprintf("Retry initiated for phase %q — running `tofu apply` against existing workdir (idempotent).", retriedPhase),
	}

	result, err := prov.Provision(ctx, dep.Request, events)

	dep.mu.Lock()
	defer dep.mu.Unlock()
	dep.FinishedAt = time.Now()
	if err != nil {
		dep.Status = "failed"
		dep.Error = err.Error()
		h.log.Error("retry provision failed", "id", dep.ID, "phase", retriedPhase, "err", err)
		return
	}
	dep.Status = "ready"
	dep.Result = result
	h.log.Info("retry provision complete", "id", dep.ID, "phase", retriedPhase)
}
// validatePhaseID reports whether phase is a retryable phase id. It
// returns an error when phase is empty or whitespace-only, or when it
// appears in neither phase0Phases nor phase1Phases; otherwise nil.
// Package-private helper used by tests.
func validatePhaseID(phase string) error {
	switch {
	case strings.TrimSpace(phase) == "":
		return errors.New("phase id required")
	case phase0Phases[phase], phase1Phases[phase]:
		return nil
	default:
		return fmt.Errorf("unknown phase %q", phase)
	}
}

View File

@ -23,7 +23,8 @@
* new phase to the constants list ripples here automatically.
*/
import { Check, Loader2, Circle, AlertCircle, MinusCircle } from 'lucide-react'
import { Check, Loader2, Circle, AlertCircle, MinusCircle, RotateCw, BookOpen, ExternalLink } from 'lucide-react'
import { useState } from 'react'
import {
ALL_PHASES,
OPENTOFU_PHASES,
@ -44,8 +45,26 @@ export interface BootstrapProgressProps {
focusedPhaseId?: string | null
/** Compact mode: smaller rows, no descriptions. */
compact?: boolean
/**
* Retry handler invoked when the user clicks "Retry phase" on a failed row.
* Implementations should POST to
* /api/v1/deployments/<id>/phases/<phaseId>/retry
* and re-open the SSE stream. Closes issue #125 failed-phase UX.
* When omitted, the retry button is hidden (e.g. demo screenshots).
*/
onRetryPhase?: (phaseId: string) => Promise<void> | void
/**
* URL the failed-phase row's "Rollback procedure" link points at.
* Defaults to the canonical runbook anchor. Override to point at an
* internal copy when serving in air-gap environments.
*/
rollbackDocsURL?: string
}
/** Default rollback docs URL — anchor on docs/RUNBOOK-PROVISIONING.md. */
const DEFAULT_ROLLBACK_DOCS_URL =
'https://github.com/openova-io/openova/blob/main/docs/RUNBOOK-PROVISIONING.md#rollback-procedures-per-phase'
const STATUS_COLORS: Record<PhaseStatus, { fg: string; bg: string; border: string }> = {
pending: {
fg: 'var(--wiz-text-hint)',
@ -110,28 +129,51 @@ function PhaseRow({
onClick,
focused,
compact,
onRetry,
rollbackDocsURL,
}: {
phase: BootstrapPhase
state: PhaseState
onClick?: () => void
focused: boolean
compact: boolean
onRetry?: (phaseId: string) => Promise<void> | void
rollbackDocsURL: string
}) {
const colors = STATUS_COLORS[state.status]
const dur = durationLabel(state)
const clickable = !!onClick
const isFailed = state.status === 'failed'
const [retrying, setRetrying] = useState(false)
// Failed rows render a distinct red-bordered surface even when not focused
// — operators need to spot them at a glance per issue #125.
const failedSurface = isFailed
? { background: colors.bg, border: `1px solid ${colors.border}` }
: { background: focused ? colors.bg : 'transparent', border: `1px solid ${focused ? colors.border : 'transparent'}` }
async function handleRetry(e: React.MouseEvent) {
e.stopPropagation()
if (!onRetry || retrying) return
setRetrying(true)
try {
await onRetry(phase.id)
} finally {
setRetrying(false)
}
}
return (
<button
type="button"
onClick={onClick}
disabled={!clickable}
<div
role={clickable ? 'button' : undefined}
onClick={clickable ? onClick : undefined}
onKeyDown={clickable ? (e) => { if (e.key === 'Enter' || e.key === ' ') { e.preventDefault(); onClick?.() } } : undefined}
tabIndex={clickable ? 0 : -1}
aria-current={focused ? 'step' : undefined}
aria-label={`${phase.label}${state.status}${dur ? ` (${dur})` : ''}`}
className="bp-row"
className={`bp-row ${isFailed ? 'bp-row-failed' : ''}`}
style={{
background: focused ? colors.bg : 'transparent',
border: `1px solid ${focused ? colors.border : 'transparent'}`,
...failedSurface,
cursor: clickable ? 'pointer' : 'default',
}}
>
@ -164,10 +206,43 @@ function PhaseRow({
{phase.description}
</div>
)}
{state.status === 'failed' && (
<div className="bp-failed-marker">
sovereign state · <code>{failedAtSovereignState(phase.id)}</code>
</div>
{isFailed && (
<>
<div className="bp-failed-marker">
sovereign state · <code>{failedAtSovereignState(phase.id)}</code>
</div>
{state.lastEvent?.message && (
<div className="bp-failed-msg" role="alert">
{state.lastEvent.message}
</div>
)}
<div className="bp-failed-actions">
{onRetry && (
<button
type="button"
onClick={handleRetry}
disabled={retrying}
className="bp-btn bp-btn-retry"
aria-label={`Retry phase ${phase.label}`}
>
{retrying ? <Loader2 size={11} className="animate-spin" /> : <RotateCw size={11} />}
<span>{retrying ? 'Retrying…' : 'Retry phase'}</span>
</button>
)}
<a
href={`${rollbackDocsURL}-${phase.id}`}
target="_blank"
rel="noopener noreferrer"
className="bp-btn bp-btn-rollback"
onClick={(e) => e.stopPropagation()}
aria-label={`Rollback procedure for ${phase.label}`}
>
<BookOpen size={11} />
<span>Rollback procedure</span>
<ExternalLink size={10} style={{ opacity: 0.6 }} />
</a>
</div>
</>
)}
{state.lastEvent && state.status === 'running' && !compact && (
<div className="bp-tail">{state.lastEvent.message}</div>
@ -183,8 +258,18 @@ function PhaseRow({
transition: background 0.15s, border-color 0.15s;
background: transparent;
}
.bp-row:disabled { cursor: default; }
.bp-row:not(:disabled):hover { background: var(--wiz-bg-sub); }
.bp-row[role="button"]:hover { background: var(--wiz-bg-sub); }
.bp-row[role="button"]:focus-visible {
outline: 2px solid var(--wiz-accent);
outline-offset: 1px;
}
.bp-row-failed {
/* Steady red border + tinted background operators must spot
a failed phase at a glance per issue #125. */
background: rgba(248,113,113,0.07) !important;
border: 1px solid rgba(248,113,113,0.45) !important;
box-shadow: inset 3px 0 0 #F87171;
}
.bp-icon {
width: 22px; height: 22px; border-radius: 50%;
display: flex; align-items: center; justify-content: center;
@ -221,14 +306,76 @@ function PhaseRow({
background: rgba(248,113,113,0.08);
padding: 1px 5px; border-radius: 3px;
}
.bp-failed-msg {
font-size: 10.5px;
color: #FCA5A5;
font-family: 'JetBrains Mono', monospace;
background: rgba(248,113,113,0.05);
border-left: 2px solid rgba(248,113,113,0.55);
padding: 5px 8px;
margin-top: 5px;
border-radius: 0 4px 4px 0;
line-height: 1.5;
word-break: break-word;
white-space: pre-wrap;
}
.bp-failed-actions {
display: flex;
align-items: center;
gap: 6px;
margin-top: 7px;
flex-wrap: wrap;
}
.bp-btn {
display: inline-flex;
align-items: center;
gap: 5px;
padding: 4px 9px;
border-radius: 5px;
font-size: 10px;
font-weight: 600;
letter-spacing: 0.02em;
font-family: 'Inter', sans-serif;
cursor: pointer;
transition: all 0.15s;
border: 1px solid;
text-decoration: none;
line-height: 1.3;
}
.bp-btn-retry {
color: #FCA5A5;
background: rgba(248,113,113,0.10);
border-color: rgba(248,113,113,0.40);
}
.bp-btn-retry:hover:not(:disabled) {
background: rgba(248,113,113,0.18);
border-color: rgba(248,113,113,0.65);
color: #FECACA;
}
.bp-btn-retry:disabled {
opacity: 0.55;
cursor: not-allowed;
}
.bp-btn-rollback {
color: var(--wiz-text-md);
background: transparent;
border-color: var(--wiz-border-sub);
}
.bp-btn-rollback:hover {
background: var(--wiz-bg-sub);
border-color: var(--wiz-border);
color: var(--wiz-text-hi);
}
.bp-tail {
font-size: 10px; color: var(--wiz-text-lo);
font-family: 'JetBrains Mono', monospace;
white-space: nowrap; overflow: hidden; text-overflow: ellipsis;
margin-top: 2px;
}
.animate-spin { animation: bp-spin 0.9s linear infinite; }
@keyframes bp-spin { to { transform: rotate(360deg); } }
`}</style>
</button>
</div>
)
}
@ -272,6 +419,8 @@ export function BootstrapProgress({
onPhaseClick,
focusedPhaseId,
compact = false,
onRetryPhase,
rollbackDocsURL = DEFAULT_ROLLBACK_DOCS_URL,
}: BootstrapProgressProps) {
const doneCount = (list: BootstrapPhase[]) =>
list.filter((p) => phases[p.id]?.status === 'done').length
@ -288,6 +437,10 @@ export function BootstrapProgress({
100,
)
// Locate the first failed phase (chronological order) so the header banner
// can surface it immediately even before the user scrolls.
const firstFailedPhase = ALL_PHASES.find((p) => phases[p.id]?.status === 'failed') ?? null
return (
<nav aria-label="Bootstrap provisioning progress" className="bp">
<header className="bp-header">
@ -298,9 +451,18 @@ export function BootstrapProgress({
<div className="bp-header-bar">
<div
className="bp-header-bar-fill"
style={{ width: `${overallPct}%`, background: allDone ? '#4ADE80' : 'var(--wiz-accent)' }}
style={{ width: `${overallPct}%`, background: firstFailedPhase ? '#F87171' : allDone ? '#4ADE80' : 'var(--wiz-accent)' }}
/>
</div>
{firstFailedPhase && (
<div className="bp-header-failed-banner" role="alert">
<AlertCircle size={12} style={{ flexShrink: 0 }} />
<span>
<strong>Phase failed:</strong> {firstFailedPhase.label} sovereign state{' '}
<code>{failedAtSovereignState(firstFailedPhase.id)}</code>. Use the row below to retry or open the rollback procedure.
</span>
</div>
)}
</header>
<SectionHeader
@ -319,6 +481,8 @@ export function BootstrapProgress({
onClick={onPhaseClick ? () => onPhaseClick(phase.id) : undefined}
focused={focusedPhaseId === phase.id}
compact={compact}
onRetry={onRetryPhase}
rollbackDocsURL={rollbackDocsURL}
/>
)
})}
@ -339,6 +503,8 @@ export function BootstrapProgress({
onClick={onPhaseClick ? () => onPhaseClick(phase.id) : undefined}
focused={focusedPhaseId === phase.id}
compact={compact}
onRetry={onRetryPhase}
rollbackDocsURL={rollbackDocsURL}
/>
)
})}
@ -368,6 +534,22 @@ export function BootstrapProgress({
.bp-header-bar-fill {
height: 100%; transition: width 0.4s ease, background 0.3s ease;
}
.bp-header-failed-banner {
display: flex; align-items: flex-start; gap: 7px;
padding: 8px 11px; border-radius: 7px;
background: rgba(248,113,113,0.08);
border: 1px solid rgba(248,113,113,0.30);
color: #FCA5A5;
font-size: 11px; line-height: 1.5;
margin-top: 4px;
}
.bp-header-failed-banner code {
font-family: 'JetBrains Mono', monospace;
background: rgba(248,113,113,0.12);
padding: 1px 5px; border-radius: 3px;
font-size: 10px;
}
.bp-header-failed-banner strong { color: #F87171; }
`}</style>
</nav>
)