daemon: extract startVMLocked into step runner with per-step rollback

startVMLocked was a ~260-line method running 18 sequential phases
with one lumped error path: on any failure, cleanupOnErr called
cleanupRuntime — a catch-all teardown that didn't distinguish
"this phase acquired resources we should undo" from "this phase is
idempotent." The blast radius was the entire VM lifecycle. Every
tweak to boot, NAT, disk, or auth-sync orchestration had to reason
about a closure that could fire at any of 18 points.

This commit extracts the phases into a data-driven pipeline:

  - startContext threads the mutable state (vm, live, apiSock,
    dmName, tapName, etc.) through every step by pointer so step
    bodies mutate in place without returning copies.
  - startStep carries the op.stage name, optional vmCreateStage
    progress ping, optional log attrs, a run closure, and an
    optional undo closure.
  - runStartSteps walks steps in order; on the first failure it
    appends the failing step to the rollback set alongside every
    step that already succeeded (so partial-acquire failures like
    machine.Start's post-spawn HTTP config still get their undo
    fired), then iterates that set in reverse and joins undo
    errors onto the run error via errors.Join.
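
The driver in condensed form (the full version, including the
vmCreateStage progress ping, is in the diff below):

    func (s *VMService) runStartSteps(ctx context.Context, op *operationLog, sc *startContext, steps []startStep) error {
        done := make([]startStep, 0, len(steps))
        for _, step := range steps {
            op.stage(step.name, step.attrs...)
            if err := step.run(ctx, sc); err != nil {
                // include the failing step: it may hold partial state
                done = append(done, step)
                if rbErr := s.rollbackStartSteps(op, sc, done); rbErr != nil {
                    err = errors.Join(err, rbErr)
                }
                return err
            }
            done = append(done, step)
        }
        return nil
    }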

Each phase that acquires a resource now owns its own undo:

  - system_overlay removes the overlay file it created,
  - dm_snapshot tears down the loop devices and DM device it set up,
  - prepare_host_features delegates to capHooks.cleanupState,
  - tap releases the device via releaseTap,
  - metrics_file removes the metrics file,
  - firecracker_launch kills the spawned PID and drops the sockets,
  - post_start_features calls capHooks.cleanupState again (capability
    Cleanup hooks are idempotent, so the call is safe whether
    PostStart reached every cap or not).

The 11 phases with no teardown obligation leave `undo` nil and the
driver silently skips them on rollback.
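
One of the guarded undos, condensed from the tap step in the diff
below: the zero-value check keeps the undo safe to call even when
run failed before acquiring anything.

    {
        name: "tap",
        run: func(ctx context.Context, sc *startContext) error {
            tap, err := s.net.acquireTap(ctx, sc.tapName)
            if err != nil {
                return err
            }
            sc.live.TapDevice = tap
            s.setVMHandles(*sc.vm, *sc.live)
            sc.vm.Runtime.TapDevice = tap
            return nil
        },
        undo: func(ctx context.Context, sc *startContext) error {
            if sc.live.TapDevice == "" {
                return nil // run failed before the tap was acquired
            }
            return s.net.releaseTap(ctx, sc.live.TapDevice)
        },
    },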

cleanupRuntime is retired from the start-failure path. It stays
intact for reconcile, stopVMLocked, killVMLocked, deleteVMLocked,
stopStaleVMs — the crash-recovery / lifecycle-teardown contract
those paths rely on is unchanged.

startVMLocked shrinks from ~225 lines of sequential-phase code
plus a cleanupOnErr closure to ~45 lines: compute derived paths,
build the step list, drive it, persist ERROR state on failure.
Stage names preserved 1:1 so existing log grep + the async-create
progress stream stay compatible.
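
What remains of startVMLocked, roughly, condensed from the added
lines in the first hunk (derived-path setup elided; see the diff
for the exact body):

    live := model.VMHandles{}
    sc := &startContext{
        vm:      &vm,
        image:   image,
        live:    &live,
        apiSock: apiSock,
        dmName:  dmName,
        tapName: tapName,
    }
    if runErr := s.runStartSteps(ctx, op, sc, s.buildStartSteps(op, sc)); runErr != nil {
        // the driver already rolled back; persist ERROR so `vm show` sees it
        vm.State = model.VMStateError
        vm.Runtime.State = model.VMStateError
        vm.Runtime.LastError = runErr.Error()
        if s.store != nil {
            _ = s.store.UpsertVM(context.Background(), vm)
        }
        return model.VMRecord{}, runErr
    }
    return vm, nil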

Tests:

  - TestRunStartSteps_RollsBackInReverseOnFailure pins the core
    contract: steps that succeeded before the failure run in order,
    all their undos fire in reverse, the failing step's undo also
    fires, and the original error stays visible via errors.Is.
  - TestRunStartSteps_SkipsNilUndos — optional-undo contract.
  - TestRunStartSteps_JoinsRollbackErrors — undo failures don't
    hide the root cause.
  - TestRunStartSteps_HappyPathNoRollback — success path never
    fires any undo.

Smoke: all 21 scenarios pass, including the start-path ones
(bare vm run, workspace vm run, vm restart, vm lifecycle, vm set
reconfig) that exercise real firecracker boots end-to-end.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

@@ -3,7 +3,6 @@ package daemon
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"strconv"
@@ -11,8 +10,6 @@ import (
"time"
"banger/internal/api"
"banger/internal/firecracker"
"banger/internal/imagepull"
"banger/internal/model"
"banger/internal/system"
)
@@ -43,28 +40,10 @@ func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image
}
op.done(vmLogAttrs(vm)...)
}()
op.stage("preflight")
vmCreateStage(ctx, "preflight", "checking host prerequisites")
if err := s.validateStartPrereqs(ctx, vm, image); err != nil {
return model.VMRecord{}, err
}
if err := os.MkdirAll(vm.Runtime.VMDir, 0o755); err != nil {
return model.VMRecord{}, err
}
op.stage("cleanup_runtime")
if err := s.cleanupRuntime(ctx, vm, true); err != nil {
return model.VMRecord{}, err
}
s.clearVMHandles(vm)
op.stage("bridge")
if err := s.net.ensureBridge(ctx); err != nil {
return model.VMRecord{}, err
}
op.stage("socket_dir")
if err := s.net.ensureSocketDir(); err != nil {
return model.VMRecord{}, err
}
// Derive per-VM paths/names up front so every step sees the same
// values. Shortening vm.ID mirrors how the pre-refactor inline
// code did it.
shortID := system.ShortID(vm.ID)
apiSock := filepath.Join(s.layout.RuntimeDir, "fc-"+shortID+".sock")
dmName := "fc-rootfs-" + shortID
@@ -78,183 +57,35 @@ func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image
return model.VMRecord{}, err
}
}
if err := os.RemoveAll(apiSock); err != nil && !os.IsNotExist(err) {
return model.VMRecord{}, err
}
if err := os.RemoveAll(vm.Runtime.VSockPath); err != nil && !os.IsNotExist(err) {
return model.VMRecord{}, err
live := model.VMHandles{}
sc := &startContext{
vm: &vm,
image: image,
live: &live,
apiSock: apiSock,
dmName: dmName,
tapName: tapName,
}
op.stage("system_overlay", "overlay_path", vm.Runtime.SystemOverlay)
vmCreateStage(ctx, "prepare_rootfs", "preparing system overlay")
if err := s.ensureSystemOverlay(ctx, &vm); err != nil {
return model.VMRecord{}, err
}
op.stage("dm_snapshot", "dm_name", dmName)
vmCreateStage(ctx, "prepare_rootfs", "creating root filesystem snapshot")
snapHandles, err := s.net.createDMSnapshot(ctx, image.RootfsPath, vm.Runtime.SystemOverlay, dmName)
if err != nil {
return model.VMRecord{}, err
}
// Live handles are threaded through this function as a local and
// pushed to the cache via setVMHandles once we have every piece.
// The cache update must happen BEFORE any step that reads handles
// back (e.g. cleanupRuntime via cleanupOnErr) — otherwise loops
// and DM would leak on an early failure.
live := model.VMHandles{
BaseLoop: snapHandles.BaseLoop,
COWLoop: snapHandles.COWLoop,
DMName: snapHandles.DMName,
DMDev: snapHandles.DMDev,
}
s.setVMHandles(vm, live)
vm.Runtime.APISockPath = apiSock
vm.Runtime.State = model.VMStateRunning
vm.State = model.VMStateRunning
vm.Runtime.LastError = ""
cleanupOnErr := func(err error) (model.VMRecord, error) {
if runErr := s.runStartSteps(ctx, op, sc, s.buildStartSteps(op, sc)); runErr != nil {
// The step driver already ran rollback in reverse for every
// succeeded step. All that's left is to persist the ERROR
// state so operators see the failure via `vm show`. Use a
// fresh context in case the request ctx is cancelled — DB
// writes past this point are recovery, not user-driven.
//
// The store check is for tests that construct a bare Daemon
// without a DB; production always has s.store non-nil.
vm.State = model.VMStateError
vm.Runtime.State = model.VMStateError
vm.Runtime.LastError = err.Error()
op.stage("cleanup_after_failure", "error", err.Error())
if cleanupErr := s.cleanupRuntime(context.Background(), vm, true); cleanupErr != nil {
err = errors.Join(err, cleanupErr)
}
vm.Runtime.LastError = runErr.Error()
vm.Runtime.TapDevice = ""
s.clearVMHandles(vm)
_ = s.store.UpsertVM(context.Background(), vm)
return model.VMRecord{}, err
}
// On a restart the COW already holds writes from a previous guest
// boot — stale free-inode / free-block counters, possibly unwritten
// journal updates. e2fsprogs (e2cp/e2rm, used by patchRootOverlay)
// refuses to touch the snapshot with "Inode bitmap checksum does
// not match bitmap", which bubbles up as a "start failed" even
// though the filesystem is kernel-valid. `e2fsck -fy` reconciles
// the bitmaps and is a no-op on a fresh snapshot, so running it
// unconditionally keeps the code path the same for first vs.
// subsequent starts. Exit code 1 means "errors fixed" — we treat
// that as success.
op.stage("fsck_snapshot")
if _, err := s.runner.RunSudo(ctx, "e2fsck", "-fy", live.DMDev); err != nil {
// e2fsck exit codes: 0=clean, 1=errors corrected, 2=reboot
// needed, 4+=uncorrected. -1 means the error wasn't an
// exec.ExitError (e.g. command not found, ctx cancel).
if code := system.ExitCode(err); code < 0 || code > 1 {
return cleanupOnErr(fmt.Errorf("fsck snapshot: %w", err))
if s.store != nil {
_ = s.store.UpsertVM(context.Background(), vm)
}
}
op.stage("patch_root_overlay")
vmCreateStage(ctx, "prepare_rootfs", "writing guest configuration")
if err := s.patchRootOverlay(ctx, vm, image); err != nil {
return cleanupOnErr(err)
}
op.stage("prepare_host_features")
vmCreateStage(ctx, "prepare_host_features", "preparing host-side vm features")
if err := s.capHooks.prepareHosts(ctx, &vm, image); err != nil {
return cleanupOnErr(err)
}
op.stage("tap")
tap, err := s.net.acquireTap(ctx, tapName)
if err != nil {
return cleanupOnErr(err)
}
live.TapDevice = tap
s.setVMHandles(vm, live)
// Mirror onto VM.Runtime so NAT teardown can recover the tap
// name from the DB even if the handle cache is empty (daemon
// crash + restart, corrupt handles.json).
vm.Runtime.TapDevice = tap
op.stage("metrics_file", "metrics_path", vm.Runtime.MetricsPath)
if err := os.WriteFile(vm.Runtime.MetricsPath, nil, 0o644); err != nil {
return cleanupOnErr(err)
}
op.stage("firecracker_binary")
fcPath, err := s.net.firecrackerBinary()
if err != nil {
return cleanupOnErr(err)
}
op.stage("firecracker_launch", "log_path", vm.Runtime.LogPath, "metrics_path", vm.Runtime.MetricsPath)
vmCreateStage(ctx, "boot_firecracker", "starting firecracker")
kernelArgs := system.BuildBootArgs(vm.Name)
if strings.TrimSpace(image.InitrdPath) == "" {
// Direct-boot image (no initramfs) — the rootfs may be a
// container image without /sbin/init or iproute2. Use:
// 1. Kernel-level IP config via ip= cmdline (CONFIG_IP_PNP),
// so the network is up before init runs — no ip(8) needed.
// 2. init= pointing at our universal wrapper which installs
// systemd+sshd on first boot if missing.
kernelArgs = system.BuildBootArgsWithKernelIP(
vm.Name, vm.Runtime.GuestIP, s.config.BridgeIP, s.config.DefaultDNS,
) + " init=" + imagepull.FirstBootScriptPath
}
machineConfig := firecracker.MachineConfig{
BinaryPath: fcPath,
VMID: vm.ID,
SocketPath: apiSock,
LogPath: vm.Runtime.LogPath,
MetricsPath: vm.Runtime.MetricsPath,
KernelImagePath: image.KernelPath,
InitrdPath: image.InitrdPath,
KernelArgs: kernelArgs,
Drives: []firecracker.DriveConfig{{
ID: "rootfs",
Path: live.DMDev,
ReadOnly: false,
IsRoot: true,
}},
TapDevice: tap,
VSockPath: vm.Runtime.VSockPath,
VSockCID: vm.Runtime.VSockCID,
VCPUCount: vm.Spec.VCPUCount,
MemoryMiB: vm.Spec.MemoryMiB,
Logger: s.logger,
}
s.capHooks.contributeMachine(&machineConfig, vm, image)
machine, err := firecracker.NewMachine(ctx, machineConfig)
if err != nil {
return cleanupOnErr(err)
}
if err := machine.Start(ctx); err != nil {
// Use a fresh context: the request ctx may already be cancelled (client
// disconnect), but we still need the PID so cleanupRuntime can kill the
// Firecracker process that was spawned before the failure.
live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, apiSock)
s.setVMHandles(vm, live)
return cleanupOnErr(err)
}
live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, apiSock)
s.setVMHandles(vm, live)
op.debugStage("firecracker_started", "pid", live.PID)
op.stage("socket_access", "api_socket", apiSock)
if err := s.net.ensureSocketAccess(ctx, apiSock, "firecracker api socket"); err != nil {
return cleanupOnErr(err)
}
op.stage("vsock_access", "vsock_path", vm.Runtime.VSockPath, "vsock_cid", vm.Runtime.VSockCID)
if err := s.net.ensureSocketAccess(ctx, vm.Runtime.VSockPath, "firecracker vsock socket"); err != nil {
return cleanupOnErr(err)
}
vmCreateStage(ctx, "wait_vsock_agent", "waiting for guest vsock agent")
if err := s.net.waitForGuestVSockAgent(ctx, vm.Runtime.VSockPath, vsockReadyWait); err != nil {
return cleanupOnErr(err)
}
op.stage("post_start_features")
vmCreateStage(ctx, "wait_guest_ready", "waiting for guest services")
if err := s.capHooks.postStart(ctx, vm, image); err != nil {
return cleanupOnErr(err)
}
system.TouchNow(&vm)
op.stage("persist")
vmCreateStage(ctx, "finalize", "saving vm state")
if err := s.store.UpsertVM(ctx, vm); err != nil {
return cleanupOnErr(err)
return model.VMRecord{}, runErr
}
return vm, nil
}

@@ -0,0 +1,434 @@
package daemon
import (
"context"
"errors"
"fmt"
"os"
"strings"
"banger/internal/firecracker"
"banger/internal/imagepull"
"banger/internal/model"
"banger/internal/system"
)
// buildKernelArgs assembles the kernel command line for a start.
// Direct-boot images (no initrd) get kernel-level IP config so the
// network is up before init, plus init= pointing at the universal
// first-boot wrapper. Anything else uses the plain variant.
func buildKernelArgs(vm model.VMRecord, image model.Image, bridgeIP, defaultDNS string) string {
if strings.TrimSpace(image.InitrdPath) == "" {
return system.BuildBootArgsWithKernelIP(
vm.Name, vm.Runtime.GuestIP, bridgeIP, defaultDNS,
) + " init=" + imagepull.FirstBootScriptPath
}
return system.BuildBootArgs(vm.Name)
}
// startContext is the mutable state threaded through every start
// step. `vm` and `live` are pointers so steps mutate in place —
// dodges returning redundant copies and keeps step bodies readable.
// Values computed by `startVMLocked` before the driver runs
// (apiSock, dmName, tapName) live here too so each step can read
// them without rederiving.
type startContext struct {
vm *model.VMRecord
image model.Image
live *model.VMHandles
apiSock string
dmName string
tapName string
fcPath string
machine *firecracker.Machine
// systemOverlayCreated records whether the system_overlay step
// actually created the file (vs. the file existing from a crashed
// prior attempt). The undo honours it so a leftover-but-valid
// overlay isn't deleted under us.
systemOverlayCreated bool
}
// startStep is one phase in the start-VM pipeline. Phases with no
// rollback obligation leave `undo` nil — the driver simply skips
// them on the rollback path. `createStage` / `createDetail` are
// forwarded to `vmCreateStage` so the async-create RPC caller sees
// progress; they're "" for phases that were never part of the
// user-facing progress stream.
type startStep struct {
name string
attrs []any
createStage string
createDetail string
run func(ctx context.Context, sc *startContext) error
undo func(ctx context.Context, sc *startContext) error
}
// runStartSteps walks steps in order, logging each via `op.stage`
// (and `vmCreateStage` when the step opted in). On the first
// run-err, it rolls back the prefix (including the failing step, so
// a step that acquired resources before erroring gets its undo
// fired) and returns the original err joined with any rollback err.
//
// Contract: `undo` must be safe to call even when `run` returned
// an error — check zero-value guards rather than assuming success.
// This is cheaper than a two-phase acquire/commit per step and
// matches how `cleanupPreparedCapabilities` in capabilities.go
// treats partial-success rollback.
func (s *VMService) runStartSteps(ctx context.Context, op *operationLog, sc *startContext, steps []startStep) error {
done := make([]startStep, 0, len(steps))
for _, step := range steps {
if step.createStage != "" {
vmCreateStage(ctx, step.createStage, step.createDetail)
}
op.stage(step.name, step.attrs...)
if err := step.run(ctx, sc); err != nil {
done = append(done, step) // include the failing step — see contract above
if rollbackErr := s.rollbackStartSteps(op, sc, done); rollbackErr != nil {
err = errors.Join(err, rollbackErr)
}
return err
}
done = append(done, step)
}
return nil
}
// rollbackStartSteps iterates completed steps in reverse, calling
// each non-nil `undo` with a detached context — the original ctx
// may already be cancelled (RPC client disconnect), but cleanup
// still needs to run. Undo errors are joined together; one step's
// failure doesn't short-circuit the rest.
func (s *VMService) rollbackStartSteps(op *operationLog, sc *startContext, done []startStep) error {
var err error
for i := len(done) - 1; i >= 0; i-- {
step := done[i]
if step.undo == nil {
continue
}
op.stage("rollback_" + step.name)
if undoErr := step.undo(context.Background(), sc); undoErr != nil {
err = errors.Join(err, fmt.Errorf("rollback %s: %w", step.name, undoErr))
}
}
return err
}
// buildStartSteps returns the ordered list of phases startVMLocked
// drives. Keeping the list as data (vs. a long linear method body)
// makes the phase inventory diff-readable and lets a test driver
// substitute its own step slice.
//
// Phase names MUST stay 1:1 with the prior inline version — they
// appear in daemon logs, smoke-log greps, and the async-create
// progress stream that clients read.
func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startStep {
return []startStep{
{
name: "preflight",
createStage: "preflight",
createDetail: "checking host prerequisites",
run: func(ctx context.Context, sc *startContext) error {
if err := s.validateStartPrereqs(ctx, *sc.vm, sc.image); err != nil {
return err
}
return os.MkdirAll(sc.vm.Runtime.VMDir, 0o755)
},
},
{
name: "cleanup_runtime",
run: func(ctx context.Context, sc *startContext) error {
if err := s.cleanupRuntime(ctx, *sc.vm, true); err != nil {
return err
}
s.clearVMHandles(*sc.vm)
return nil
},
},
{
name: "bridge",
run: func(ctx context.Context, _ *startContext) error {
return s.net.ensureBridge(ctx)
},
},
{
name: "socket_dir",
run: func(_ context.Context, _ *startContext) error {
return s.net.ensureSocketDir()
},
},
{
// prepare_sockets is a new op.stage label — the prior
// inline code ran these `os.RemoveAll` calls before the
// system_overlay stage without a stage marker. Keeping a
// distinct name makes the log trace and rollback (if any
// later step fails) unambiguous.
name: "prepare_sockets",
run: func(_ context.Context, sc *startContext) error {
if err := os.RemoveAll(sc.apiSock); err != nil && !os.IsNotExist(err) {
return err
}
if err := os.RemoveAll(sc.vm.Runtime.VSockPath); err != nil && !os.IsNotExist(err) {
return err
}
return nil
},
},
{
name: "system_overlay",
attrs: []any{"overlay_path", sc.vm.Runtime.SystemOverlay},
createStage: "prepare_rootfs",
createDetail: "preparing system overlay",
run: func(ctx context.Context, sc *startContext) error {
// Record ownership BEFORE the call so a partial-truncate
// failure still triggers cleanup of the half-created file.
if !exists(sc.vm.Runtime.SystemOverlay) {
sc.systemOverlayCreated = true
}
return s.ensureSystemOverlay(ctx, sc.vm)
},
undo: func(_ context.Context, sc *startContext) error {
if !sc.systemOverlayCreated {
return nil
}
if err := os.Remove(sc.vm.Runtime.SystemOverlay); err != nil && !os.IsNotExist(err) {
return err
}
return nil
},
},
{
name: "dm_snapshot",
attrs: []any{"dm_name", sc.dmName},
createStage: "prepare_rootfs",
createDetail: "creating root filesystem snapshot",
run: func(ctx context.Context, sc *startContext) error {
snapHandles, err := s.net.createDMSnapshot(ctx, sc.image.RootfsPath, sc.vm.Runtime.SystemOverlay, sc.dmName)
if err != nil {
// createDMSnapshot cleans up its own partial state on
// err; leave sc.live zero so the undo is a no-op.
return err
}
sc.live.BaseLoop = snapHandles.BaseLoop
sc.live.COWLoop = snapHandles.COWLoop
sc.live.DMName = snapHandles.DMName
sc.live.DMDev = snapHandles.DMDev
s.setVMHandles(*sc.vm, *sc.live)
// Fields that used to land next to the (now-deleted)
// cleanupOnErr closure. They belong with the DM
// snapshot because that's the first step producing
// runtime identity the downstream code reads back.
sc.vm.Runtime.APISockPath = sc.apiSock
sc.vm.Runtime.State = model.VMStateRunning
sc.vm.State = model.VMStateRunning
sc.vm.Runtime.LastError = ""
return nil
},
undo: func(ctx context.Context, sc *startContext) error {
if sc.live.DMName == "" && sc.live.BaseLoop == "" && sc.live.COWLoop == "" {
return nil
}
return s.net.cleanupDMSnapshot(ctx, dmSnapshotHandles{
BaseLoop: sc.live.BaseLoop,
COWLoop: sc.live.COWLoop,
DMName: sc.live.DMName,
DMDev: sc.live.DMDev,
})
},
},
{
// See the comment in the prior inline version: stale
// bitmaps from a reused COW make e2cp/e2rm refuse to
// touch the snapshot. e2fsck -fy is a no-op on a fresh
// snapshot. Exit codes 0 + 1 are both "ok" here.
name: "fsck_snapshot",
run: func(ctx context.Context, sc *startContext) error {
if _, err := s.runner.RunSudo(ctx, "e2fsck", "-fy", sc.live.DMDev); err != nil {
if code := system.ExitCode(err); code < 0 || code > 1 {
return fmt.Errorf("fsck snapshot: %w", err)
}
}
return nil
},
},
{
name: "patch_root_overlay",
createStage: "prepare_rootfs",
createDetail: "writing guest configuration",
run: func(ctx context.Context, sc *startContext) error {
return s.patchRootOverlay(ctx, *sc.vm, sc.image)
},
},
{
name: "prepare_host_features",
createStage: "prepare_host_features",
createDetail: "preparing host-side vm features",
run: func(ctx context.Context, sc *startContext) error {
return s.capHooks.prepareHosts(ctx, sc.vm, sc.image)
},
// On err, prepareHosts already cleaned up the prefix that
// succeeded before the failing capability. On success, any
// LATER step failure triggers this undo, which tears down
// ALL prepared caps via their Cleanup hooks.
undo: func(ctx context.Context, sc *startContext) error {
return s.capHooks.cleanupState(ctx, *sc.vm)
},
},
{
name: "tap",
run: func(ctx context.Context, sc *startContext) error {
tap, err := s.net.acquireTap(ctx, sc.tapName)
if err != nil {
return err
}
sc.live.TapDevice = tap
s.setVMHandles(*sc.vm, *sc.live)
// Mirror onto VM.Runtime for NAT teardown resilience
// across daemon crashes — see vm.Runtime.TapDevice docs.
sc.vm.Runtime.TapDevice = tap
return nil
},
undo: func(ctx context.Context, sc *startContext) error {
if sc.live.TapDevice == "" {
return nil
}
return s.net.releaseTap(ctx, sc.live.TapDevice)
},
},
{
name: "metrics_file",
attrs: []any{"metrics_path", sc.vm.Runtime.MetricsPath},
run: func(_ context.Context, sc *startContext) error {
return os.WriteFile(sc.vm.Runtime.MetricsPath, nil, 0o644)
},
undo: func(_ context.Context, sc *startContext) error {
if err := os.Remove(sc.vm.Runtime.MetricsPath); err != nil && !os.IsNotExist(err) {
return err
}
return nil
},
},
{
name: "firecracker_binary",
run: func(_ context.Context, sc *startContext) error {
fcPath, err := s.net.firecrackerBinary()
if err != nil {
return err
}
sc.fcPath = fcPath
return nil
},
},
{
name: "firecracker_launch",
attrs: []any{"log_path", sc.vm.Runtime.LogPath, "metrics_path", sc.vm.Runtime.MetricsPath},
createStage: "boot_firecracker",
createDetail: "starting firecracker",
run: func(ctx context.Context, sc *startContext) error {
kernelArgs := buildKernelArgs(*sc.vm, sc.image, s.config.BridgeIP, s.config.DefaultDNS)
machineConfig := firecracker.MachineConfig{
BinaryPath: sc.fcPath,
VMID: sc.vm.ID,
SocketPath: sc.apiSock,
LogPath: sc.vm.Runtime.LogPath,
MetricsPath: sc.vm.Runtime.MetricsPath,
KernelImagePath: sc.image.KernelPath,
InitrdPath: sc.image.InitrdPath,
KernelArgs: kernelArgs,
Drives: []firecracker.DriveConfig{{
ID: "rootfs",
Path: sc.live.DMDev,
ReadOnly: false,
IsRoot: true,
}},
TapDevice: sc.live.TapDevice,
VSockPath: sc.vm.Runtime.VSockPath,
VSockCID: sc.vm.Runtime.VSockCID,
VCPUCount: sc.vm.Spec.VCPUCount,
MemoryMiB: sc.vm.Spec.MemoryMiB,
Logger: s.logger,
}
s.capHooks.contributeMachine(&machineConfig, *sc.vm, sc.image)
machine, err := firecracker.NewMachine(ctx, machineConfig)
if err != nil {
return err
}
sc.machine = machine
if err := machine.Start(ctx); err != nil {
// machine.Start can fail AFTER the firecracker process
// is already spawned (HTTP config phase). Record the
// PID so the undo can kill it; use a fresh ctx since
// the request ctx may be cancelled by now.
sc.live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, sc.apiSock)
s.setVMHandles(*sc.vm, *sc.live)
return err
}
sc.live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, sc.apiSock)
s.setVMHandles(*sc.vm, *sc.live)
op.debugStage("firecracker_started", "pid", sc.live.PID)
return nil
},
undo: func(ctx context.Context, sc *startContext) error {
var errs []error
if sc.live.PID > 0 {
if err := s.net.killVMProcess(ctx, sc.live.PID); err != nil {
errs = append(errs, err)
}
}
if err := os.Remove(sc.apiSock); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
if err := os.Remove(sc.vm.Runtime.VSockPath); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
return errors.Join(errs...)
},
},
{
name: "socket_access",
attrs: []any{"api_socket", sc.apiSock},
run: func(ctx context.Context, sc *startContext) error {
return s.net.ensureSocketAccess(ctx, sc.apiSock, "firecracker api socket")
},
},
{
name: "vsock_access",
attrs: []any{"vsock_path", sc.vm.Runtime.VSockPath, "vsock_cid", sc.vm.Runtime.VSockCID},
run: func(ctx context.Context, sc *startContext) error {
return s.net.ensureSocketAccess(ctx, sc.vm.Runtime.VSockPath, "firecracker vsock socket")
},
},
{
name: "wait_vsock_agent",
createStage: "wait_vsock_agent",
createDetail: "waiting for guest vsock agent",
run: func(ctx context.Context, sc *startContext) error {
return s.net.waitForGuestVSockAgent(ctx, sc.vm.Runtime.VSockPath, vsockReadyWait)
},
},
{
name: "post_start_features",
createStage: "wait_guest_ready",
createDetail: "waiting for guest services",
run: func(ctx context.Context, sc *startContext) error {
return s.capHooks.postStart(ctx, *sc.vm, sc.image)
},
// Capability Cleanup hooks are designed to be idempotent
// (check feature-enabled flag, no-op if nothing to undo),
// so calling cleanupState here is safe whether postStart
// reached every cap or bailed midway.
undo: func(ctx context.Context, sc *startContext) error {
return s.capHooks.cleanupState(ctx, *sc.vm)
},
},
{
name: "persist",
createStage: "finalize",
createDetail: "saving vm state",
run: func(ctx context.Context, sc *startContext) error {
system.TouchNow(sc.vm)
return s.store.UpsertVM(ctx, *sc.vm)
},
},
}
}

@@ -0,0 +1,164 @@
package daemon
import (
"context"
"errors"
"io"
"log/slog"
"strings"
"testing"
)
// TestRunStartSteps_RollsBackInReverseOnFailure pins the driver
// contract at the heart of commit 1's refactor: on a step failure
// (a) every step that succeeded BEFORE the failing one gets its
// undo fired in reverse order; (b) the failing step's undo also
// fires, because steps may acquire partial state before returning
// err; (c) the final error wraps both the run error and any
// rollback errors via errors.Join.
func TestRunStartSteps_RollsBackInReverseOnFailure(t *testing.T) {
s := &VMService{}
op := &operationLog{logger: slog.New(slog.NewTextHandler(io.Discard, nil))}
sc := &startContext{}
var events []string
record := func(label string) func(context.Context, *startContext) error {
return func(context.Context, *startContext) error {
events = append(events, label)
return nil
}
}
recordErr := func(label string, err error) func(context.Context, *startContext) error {
return func(context.Context, *startContext) error {
events = append(events, label)
return err
}
}
steps := []startStep{
{name: "first", run: record("run-first"), undo: record("undo-first")},
{name: "second", run: record("run-second"), undo: record("undo-second")},
{name: "third", run: recordErr("run-third", errors.New("boom")), undo: record("undo-third")},
{name: "fourth", run: record("run-fourth"), undo: record("undo-fourth")},
}
err := s.runStartSteps(context.Background(), op, sc, steps)
if err == nil || !strings.Contains(err.Error(), "boom") {
t.Fatalf("runStartSteps err = %v, want containing 'boom'", err)
}
want := []string{
// Forward run: first, second, third (fails — fourth never runs).
"run-first", "run-second", "run-third",
// Reverse undo: third, second, first. Fourth never ran so no undo-fourth.
"undo-third", "undo-second", "undo-first",
}
if len(events) != len(want) {
t.Fatalf("events length = %d, want %d:\n got: %v\n want: %v", len(events), len(want), events, want)
}
for i := range want {
if events[i] != want[i] {
t.Fatalf("events[%d] = %q, want %q\n got: %v\n want: %v", i, events[i], want[i], events, want)
}
}
}
// TestRunStartSteps_SkipsNilUndos proves the optional-undo contract:
// steps without teardown obligations leave `undo` nil and the driver
// must silently skip them during rollback rather than panicking.
func TestRunStartSteps_SkipsNilUndos(t *testing.T) {
s := &VMService{}
op := &operationLog{logger: slog.New(slog.NewTextHandler(io.Discard, nil))}
sc := &startContext{}
var undoCalls []string
undo := func(label string) func(context.Context, *startContext) error {
return func(context.Context, *startContext) error {
undoCalls = append(undoCalls, label)
return nil
}
}
noop := func(context.Context, *startContext) error { return nil }
steps := []startStep{
{name: "has-undo", run: noop, undo: undo("has-undo")},
{name: "no-undo", run: noop}, // undo nil intentionally
{name: "failing", run: func(context.Context, *startContext) error { return errors.New("x") }, undo: undo("failing")},
}
if err := s.runStartSteps(context.Background(), op, sc, steps); err == nil {
t.Fatal("runStartSteps err = nil, want failure")
}
// Rollback order: failing (acquired state, so its undo runs), no-undo
// (skipped — nil), has-undo.
want := []string{"failing", "has-undo"}
if len(undoCalls) != len(want) || undoCalls[0] != want[0] || undoCalls[1] != want[1] {
t.Fatalf("undo calls = %v, want %v", undoCalls, want)
}
}
// TestRunStartSteps_JoinsRollbackErrors asserts that undo errors are
// joined onto the original run error rather than hiding it — the
// caller must always see the root cause ("boom") even when the
// rollback path itself is messy.
func TestRunStartSteps_JoinsRollbackErrors(t *testing.T) {
s := &VMService{}
op := &operationLog{logger: slog.New(slog.NewTextHandler(io.Discard, nil))}
sc := &startContext{}
rootErr := errors.New("boom")
undoErr := errors.New("undo-fail")
steps := []startStep{
{
name: "ok",
run: func(context.Context, *startContext) error { return nil },
undo: func(context.Context, *startContext) error { return undoErr },
},
{
name: "fail",
run: func(context.Context, *startContext) error { return rootErr },
},
}
err := s.runStartSteps(context.Background(), op, sc, steps)
if err == nil {
t.Fatal("err = nil, want joined error")
}
if !errors.Is(err, rootErr) {
t.Fatalf("err does not wrap rootErr; got: %v", err)
}
if !errors.Is(err, undoErr) {
t.Fatalf("err does not wrap undoErr; got: %v", err)
}
}
// TestRunStartSteps_HappyPathNoRollback confirms that when every
// step's run returns nil, no undo fires — rollback is strictly a
// failure-path concern.
func TestRunStartSteps_HappyPathNoRollback(t *testing.T) {
s := &VMService{}
op := &operationLog{logger: slog.New(slog.NewTextHandler(io.Discard, nil))}
sc := &startContext{}
var undoCalled bool
steps := []startStep{
{
name: "a",
run: func(context.Context, *startContext) error { return nil },
undo: func(context.Context, *startContext) error { undoCalled = true; return nil },
},
{
name: "b",
run: func(context.Context, *startContext) error { return nil },
},
}
if err := s.runStartSteps(context.Background(), op, sc, steps); err != nil {
t.Fatalf("runStartSteps err = %v, want nil", err)
}
if undoCalled {
t.Fatal("undo fired on happy path — rollback must only run on failure")
}
}