banger/internal/daemon/vm_lifecycle_steps_test.go
Thales Maciel 11a33604c0
daemon: extract startVMLocked into step runner with per-step rollback
startVMLocked was a ~260-line method running 18 sequential phases
with one lumped error path: on any failure, cleanupOnErr called
cleanupRuntime — a catch-all teardown that didn't distinguish
"this phase acquired resources we should undo" from "this phase is
idempotent." The blast radius was the entire VM lifecycle. Every
tweak to boot, NAT, disk, or auth-sync orchestration had to reason
about a closure that could fire at any of 18 points.

This commit extracts the phases into a data-driven pipeline:

  - startContext threads the mutable state (vm, live, apiSock,
    dmName, tapName, etc.) through every step by pointer so step
    bodies mutate in place without returning copies.
  - startStep carries the op.stage name, optional vmCreateStage
    progress ping, optional log attrs, a run closure, and an
    optional undo closure.
  - runStartSteps walks steps in order, appends the failing step
    to the rollback set (so partial-acquire failures like
    machine.Start's post-spawn HTTP config get their undo fired),
    then iterates the rollback set in reverse and joins errors
    via errors.Join.

Each phase that acquires a resource now owns its own undo:
system_overlay removes a file it created, dm_snapshot cleans up
the loop + DM handles it set, prepare_host_features delegates to
capHooks.cleanupState, tap releases via releaseTap, metrics_file
removes the file, firecracker_launch kills the spawned PID and
drops the sockets, post_start_features calls capHooks.cleanupState
again (capability Cleanup hooks are idempotent — safe to call
whether PostStart reached every cap or not). The 11 phases with
no teardown obligation leave `undo` nil and the driver silently
skips them on rollback.

cleanupRuntime is retired from the start-failure path. It stays
intact for reconcile, stopVMLocked, killVMLocked, deleteVMLocked,
stopStaleVMs — the crash-recovery / lifecycle-teardown contract
those paths rely on is unchanged.

startVMLocked shrinks from ~225 lines of sequential-phase code
plus a cleanupOnErr closure to ~45 lines: compute derived paths,
build the step list, drive it, persist ERROR state on failure.
Stage names preserved 1:1 so existing log grep + the async-create
progress stream stay compatible.

Tests:

  - TestRunStartSteps_RollsBackInReverseOnFailure — pins the
    contract: the steps before the failing one run, their undos
    fire in reverse order, the failing step's undo also fires, and
    the original err is still visible via errors.Is.
  - TestRunStartSteps_SkipsNilUndos — optional-undo contract.
  - TestRunStartSteps_JoinsRollbackErrors — undo failures don't
    hide the root cause.
  - TestRunStartSteps_HappyPathNoRollback — success path never
    fires any undo.

Smoke: all 21 scenarios pass, including the start-path ones
(bare vm run, workspace vm run, vm restart, vm lifecycle, vm set
reconfig) that exercise real firecracker boots end-to-end.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 15:34:34 -03:00

164 lines
5.4 KiB
Go

package daemon
import (
"context"
"errors"
"io"
"log/slog"
"strings"
"testing"
)
// TestRunStartSteps_RollsBackInReverseOnFailure pins the core contract
// of the runStartSteps driver. When a step's run returns an error:
//
//   - every step that succeeded before the failing one has its undo
//     fired, in reverse order;
//   - the failing step's own undo also fires, because a step may have
//     acquired partial state before returning its error;
//   - steps after the failing one never run and are never undone;
//   - the returned error still exposes the original run error, both
//     through errors.Is and in its message.
func TestRunStartSteps_RollsBackInReverseOnFailure(t *testing.T) {
	s := &VMService{}
	op := &operationLog{logger: slog.New(slog.NewTextHandler(io.Discard, nil))}
	sc := &startContext{}

	// events collects run/undo labels in the order the driver invokes them.
	var events []string
	record := func(label string) func(context.Context, *startContext) error {
		return func(context.Context, *startContext) error {
			events = append(events, label)
			return nil
		}
	}
	recordErr := func(label string, err error) func(context.Context, *startContext) error {
		return func(context.Context, *startContext) error {
			events = append(events, label)
			return err
		}
	}

	// Sentinel so the assertion below can check identity through the
	// wrapped error, not just message text.
	errBoom := errors.New("boom")
	steps := []startStep{
		{name: "first", run: record("run-first"), undo: record("undo-first")},
		{name: "second", run: record("run-second"), undo: record("undo-second")},
		{name: "third", run: recordErr("run-third", errBoom), undo: record("undo-third")},
		{name: "fourth", run: record("run-fourth"), undo: record("undo-fourth")},
	}

	err := s.runStartSteps(context.Background(), op, sc, steps)
	if err == nil || !strings.Contains(err.Error(), "boom") {
		t.Fatalf("runStartSteps err = %v, want containing 'boom'", err)
	}
	// A substring match alone would still pass if the driver flattened
	// the error to text; errors.Is proves the original error is wrapped.
	if !errors.Is(err, errBoom) {
		t.Fatalf("runStartSteps err = %v, want errors.Is match for the failing step's error", err)
	}

	want := []string{
		// Forward run: first, second, third (fails — fourth never runs).
		"run-first", "run-second", "run-third",
		// Reverse undo: third, second, first. Fourth never ran so no undo-fourth.
		"undo-third", "undo-second", "undo-first",
	}
	// Check the length first so the element-wise loop cannot index out of range.
	if len(events) != len(want) {
		t.Fatalf("events length = %d, want %d:\n got: %v\n want: %v", len(events), len(want), events, want)
	}
	for i := range want {
		if events[i] != want[i] {
			t.Fatalf("events[%d] = %q, want %q\n got: %v\n want: %v", i, events[i], want[i], events, want)
		}
	}
}
// TestRunStartSteps_SkipsNilUndos proves the optional-undo contract:
// steps without teardown obligations leave `undo` nil and the driver
// must silently skip them during rollback rather than panicking.
func TestRunStartSteps_SkipsNilUndos(t *testing.T) {
s := &VMService{}
op := &operationLog{logger: slog.New(slog.NewTextHandler(io.Discard, nil))}
sc := &startContext{}
var undoCalls []string
undo := func(label string) func(context.Context, *startContext) error {
return func(context.Context, *startContext) error {
undoCalls = append(undoCalls, label)
return nil
}
}
noop := func(context.Context, *startContext) error { return nil }
steps := []startStep{
{name: "has-undo", run: noop, undo: undo("has-undo")},
{name: "no-undo", run: noop}, // undo nil intentionally
{name: "failing", run: func(context.Context, *startContext) error { return errors.New("x") }, undo: undo("failing")},
}
if err := s.runStartSteps(context.Background(), op, sc, steps); err == nil {
t.Fatal("runStartSteps err = nil, want failure")
}
// Rollback order: failing (acquired state, so its undo runs), no-undo
// (skipped — nil), has-undo.
want := []string{"failing", "has-undo"}
if len(undoCalls) != len(want) || undoCalls[0] != want[0] || undoCalls[1] != want[1] {
t.Fatalf("undo calls = %v, want %v", undoCalls, want)
}
}
// TestRunStartSteps_JoinsRollbackErrors asserts that undo errors are
// joined onto the original run error rather than hiding it — the
// caller must always see the root cause ("boom") even when the
// rollback path itself is messy.
func TestRunStartSteps_JoinsRollbackErrors(t *testing.T) {
s := &VMService{}
op := &operationLog{logger: slog.New(slog.NewTextHandler(io.Discard, nil))}
sc := &startContext{}
rootErr := errors.New("boom")
undoErr := errors.New("undo-fail")
steps := []startStep{
{
name: "ok",
run: func(context.Context, *startContext) error { return nil },
undo: func(context.Context, *startContext) error { return undoErr },
},
{
name: "fail",
run: func(context.Context, *startContext) error { return rootErr },
},
}
err := s.runStartSteps(context.Background(), op, sc, steps)
if err == nil {
t.Fatal("err = nil, want joined error")
}
if !errors.Is(err, rootErr) {
t.Fatalf("err does not wrap rootErr; got: %v", err)
}
if !errors.Is(err, undoErr) {
t.Fatalf("err does not wrap undoErr; got: %v", err)
}
}
// TestRunStartSteps_HappyPathNoRollback confirms that when every
// step's run returns nil, no undo fires — rollback is strictly a
// failure-path concern.
func TestRunStartSteps_HappyPathNoRollback(t *testing.T) {
s := &VMService{}
op := &operationLog{logger: slog.New(slog.NewTextHandler(io.Discard, nil))}
sc := &startContext{}
var undoCalled bool
steps := []startStep{
{
name: "a",
run: func(context.Context, *startContext) error { return nil },
undo: func(context.Context, *startContext) error { undoCalled = true; return nil },
},
{
name: "b",
run: func(context.Context, *startContext) error { return nil },
},
}
if err := s.runStartSteps(context.Background(), op, sc, steps); err != nil {
t.Fatalf("runStartSteps err = %v, want nil", err)
}
if undoCalled {
t.Fatal("undo fired on happy path — rollback must only run on failure")
}
}