daemon: extract startVMLocked into step runner with per-step rollback

startVMLocked was a ~260-line method running 18 sequential phases
with one lumped error path: on any failure, cleanupOnErr called
cleanupRuntime — a catch-all teardown that didn't distinguish
"this phase acquired resources we should undo" from "this phase is
idempotent." The blast radius was the entire VM lifecycle. Every
tweak to boot, NAT, disk, or auth-sync orchestration had to reason
about a closure that could fire at any of 18 points.

This commit extracts the phases into a data-driven pipeline:

  - startContext threads the mutable state (vm, live, apiSock,
    dmName, tapName, etc.) through every step by pointer so step
    bodies mutate in place without returning copies.
  - startStep carries the op.stage name, optional vmCreateStage
    progress ping, optional log attrs, a run closure, and an
    optional undo closure.
  - runStartSteps walks steps in order, appends the failing step
    to the rollback set (so partial-acquire failures like
    machine.Start's post-spawn HTTP config get their undo fired),
    then iterates the rollback set in reverse and joins errors
    via errors.Join.

Each phase that acquires a resource now owns its own undo:
system_overlay removes a file it created, dm_snapshot cleans up
the loop devices and DM handles it set up, prepare_host_features
delegates to capHooks.cleanupState, tap releases via releaseTap,
metrics_file
removes the file, firecracker_launch kills the spawned PID and
drops the sockets, post_start_features calls capHooks.cleanupState
again (capability Cleanup hooks are idempotent — safe to call
whether PostStart reached every cap or not). The 11 phases with
no teardown obligation leave `undo` nil and the driver silently
skips them on rollback.

cleanupRuntime is retired from the start-failure path. It stays
intact for reconcile, stopVMLocked, killVMLocked, deleteVMLocked,
stopStaleVMs — the crash-recovery / lifecycle-teardown contract
those paths rely on is unchanged.

startVMLocked shrinks from ~225 lines of sequential-phase code
plus a cleanupOnErr closure to ~45 lines: compute derived paths,
build the step list, drive it, persist ERROR state on failure.
Stage names are preserved 1:1 so existing log greps and the
async-create progress stream stay compatible.

Tests:

  - TestRunStartSteps_RollsBackInReverseOnFailure — the contract
    is pinned: every step that succeeded before the failure has
    its undo fired in reverse order, the failing step's own undo
    also fires, and the original error stays visible via
    errors.Is.
  - TestRunStartSteps_SkipsNilUndos — optional-undo contract.
  - TestRunStartSteps_JoinsRollbackErrors — undo failures don't
    hide the root cause.
  - TestRunStartSteps_HappyPathNoRollback — success path never
    fires any undo.

Smoke: all 21 scenarios pass, including the start-path ones
(bare vm run, workspace vm run, vm restart, vm lifecycle, vm set
reconfig) that exercise real firecracker boots end-to-end.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Thales Maciel 2026-04-23 15:34:34 -03:00
parent 2ebd2b64bb
commit 11a33604c0
No known key found for this signature in database
GPG key ID: 33112E6833C34679
3 changed files with 623 additions and 194 deletions

View file

@ -3,7 +3,6 @@ package daemon
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"strconv"
@ -11,8 +10,6 @@ import (
"time"
"banger/internal/api"
"banger/internal/firecracker"
"banger/internal/imagepull"
"banger/internal/model"
"banger/internal/system"
)
@ -43,28 +40,10 @@ func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image
}
op.done(vmLogAttrs(vm)...)
}()
op.stage("preflight")
vmCreateStage(ctx, "preflight", "checking host prerequisites")
if err := s.validateStartPrereqs(ctx, vm, image); err != nil {
return model.VMRecord{}, err
}
if err := os.MkdirAll(vm.Runtime.VMDir, 0o755); err != nil {
return model.VMRecord{}, err
}
op.stage("cleanup_runtime")
if err := s.cleanupRuntime(ctx, vm, true); err != nil {
return model.VMRecord{}, err
}
s.clearVMHandles(vm)
op.stage("bridge")
if err := s.net.ensureBridge(ctx); err != nil {
return model.VMRecord{}, err
}
op.stage("socket_dir")
if err := s.net.ensureSocketDir(); err != nil {
return model.VMRecord{}, err
}
// Derive per-VM paths/names up front so every step sees the same
// values. Shortening vm.ID mirrors how the pre-refactor inline
// code did it.
shortID := system.ShortID(vm.ID)
apiSock := filepath.Join(s.layout.RuntimeDir, "fc-"+shortID+".sock")
dmName := "fc-rootfs-" + shortID
@ -78,183 +57,35 @@ func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image
return model.VMRecord{}, err
}
}
if err := os.RemoveAll(apiSock); err != nil && !os.IsNotExist(err) {
return model.VMRecord{}, err
}
if err := os.RemoveAll(vm.Runtime.VSockPath); err != nil && !os.IsNotExist(err) {
return model.VMRecord{}, err
live := model.VMHandles{}
sc := &startContext{
vm: &vm,
image: image,
live: &live,
apiSock: apiSock,
dmName: dmName,
tapName: tapName,
}
op.stage("system_overlay", "overlay_path", vm.Runtime.SystemOverlay)
vmCreateStage(ctx, "prepare_rootfs", "preparing system overlay")
if err := s.ensureSystemOverlay(ctx, &vm); err != nil {
return model.VMRecord{}, err
}
op.stage("dm_snapshot", "dm_name", dmName)
vmCreateStage(ctx, "prepare_rootfs", "creating root filesystem snapshot")
snapHandles, err := s.net.createDMSnapshot(ctx, image.RootfsPath, vm.Runtime.SystemOverlay, dmName)
if err != nil {
return model.VMRecord{}, err
}
// Live handles are threaded through this function as a local and
// pushed to the cache via setVMHandles once we have every piece.
// The cache update must happen BEFORE any step that reads handles
// back (e.g. cleanupRuntime via cleanupOnErr) — otherwise loops
// and DM would leak on an early failure.
live := model.VMHandles{
BaseLoop: snapHandles.BaseLoop,
COWLoop: snapHandles.COWLoop,
DMName: snapHandles.DMName,
DMDev: snapHandles.DMDev,
}
s.setVMHandles(vm, live)
vm.Runtime.APISockPath = apiSock
vm.Runtime.State = model.VMStateRunning
vm.State = model.VMStateRunning
vm.Runtime.LastError = ""
cleanupOnErr := func(err error) (model.VMRecord, error) {
if runErr := s.runStartSteps(ctx, op, sc, s.buildStartSteps(op, sc)); runErr != nil {
// The step driver already ran rollback in reverse for every
// succeeded step. All that's left is to persist the ERROR
// state so operators see the failure via `vm show`. Use a
// fresh context in case the request ctx is cancelled — DB
// writes past this point are recovery, not user-driven.
//
// The store check is for tests that construct a bare Daemon
// without a DB; production always has s.store non-nil.
vm.State = model.VMStateError
vm.Runtime.State = model.VMStateError
vm.Runtime.LastError = err.Error()
op.stage("cleanup_after_failure", "error", err.Error())
if cleanupErr := s.cleanupRuntime(context.Background(), vm, true); cleanupErr != nil {
err = errors.Join(err, cleanupErr)
}
vm.Runtime.LastError = runErr.Error()
vm.Runtime.TapDevice = ""
s.clearVMHandles(vm)
_ = s.store.UpsertVM(context.Background(), vm)
return model.VMRecord{}, err
}
// On a restart the COW already holds writes from a previous guest
// boot — stale free-inode / free-block counters, possibly unwritten
// journal updates. e2fsprogs (e2cp/e2rm, used by patchRootOverlay)
// refuses to touch the snapshot with "Inode bitmap checksum does
// not match bitmap", which bubbles up as a "start failed" even
// though the filesystem is kernel-valid. `e2fsck -fy` reconciles
// the bitmaps and is a no-op on a fresh snapshot, so running it
// unconditionally keeps the code path the same for first vs.
// subsequent starts. Exit code 1 means "errors fixed" — we treat
// that as success.
op.stage("fsck_snapshot")
if _, err := s.runner.RunSudo(ctx, "e2fsck", "-fy", live.DMDev); err != nil {
// e2fsck exit codes: 0=clean, 1=errors corrected, 2=reboot
// needed, 4+=uncorrected. -1 means the error wasn't an
// exec.ExitError (e.g. command not found, ctx cancel).
if code := system.ExitCode(err); code < 0 || code > 1 {
return cleanupOnErr(fmt.Errorf("fsck snapshot: %w", err))
if s.store != nil {
_ = s.store.UpsertVM(context.Background(), vm)
}
}
op.stage("patch_root_overlay")
vmCreateStage(ctx, "prepare_rootfs", "writing guest configuration")
if err := s.patchRootOverlay(ctx, vm, image); err != nil {
return cleanupOnErr(err)
}
op.stage("prepare_host_features")
vmCreateStage(ctx, "prepare_host_features", "preparing host-side vm features")
if err := s.capHooks.prepareHosts(ctx, &vm, image); err != nil {
return cleanupOnErr(err)
}
op.stage("tap")
tap, err := s.net.acquireTap(ctx, tapName)
if err != nil {
return cleanupOnErr(err)
}
live.TapDevice = tap
s.setVMHandles(vm, live)
// Mirror onto VM.Runtime so NAT teardown can recover the tap
// name from the DB even if the handle cache is empty (daemon
// crash + restart, corrupt handles.json).
vm.Runtime.TapDevice = tap
op.stage("metrics_file", "metrics_path", vm.Runtime.MetricsPath)
if err := os.WriteFile(vm.Runtime.MetricsPath, nil, 0o644); err != nil {
return cleanupOnErr(err)
}
op.stage("firecracker_binary")
fcPath, err := s.net.firecrackerBinary()
if err != nil {
return cleanupOnErr(err)
}
op.stage("firecracker_launch", "log_path", vm.Runtime.LogPath, "metrics_path", vm.Runtime.MetricsPath)
vmCreateStage(ctx, "boot_firecracker", "starting firecracker")
kernelArgs := system.BuildBootArgs(vm.Name)
if strings.TrimSpace(image.InitrdPath) == "" {
// Direct-boot image (no initramfs) — the rootfs may be a
// container image without /sbin/init or iproute2. Use:
// 1. Kernel-level IP config via ip= cmdline (CONFIG_IP_PNP),
// so the network is up before init runs — no ip(8) needed.
// 2. init= pointing at our universal wrapper which installs
// systemd+sshd on first boot if missing.
kernelArgs = system.BuildBootArgsWithKernelIP(
vm.Name, vm.Runtime.GuestIP, s.config.BridgeIP, s.config.DefaultDNS,
) + " init=" + imagepull.FirstBootScriptPath
}
machineConfig := firecracker.MachineConfig{
BinaryPath: fcPath,
VMID: vm.ID,
SocketPath: apiSock,
LogPath: vm.Runtime.LogPath,
MetricsPath: vm.Runtime.MetricsPath,
KernelImagePath: image.KernelPath,
InitrdPath: image.InitrdPath,
KernelArgs: kernelArgs,
Drives: []firecracker.DriveConfig{{
ID: "rootfs",
Path: live.DMDev,
ReadOnly: false,
IsRoot: true,
}},
TapDevice: tap,
VSockPath: vm.Runtime.VSockPath,
VSockCID: vm.Runtime.VSockCID,
VCPUCount: vm.Spec.VCPUCount,
MemoryMiB: vm.Spec.MemoryMiB,
Logger: s.logger,
}
s.capHooks.contributeMachine(&machineConfig, vm, image)
machine, err := firecracker.NewMachine(ctx, machineConfig)
if err != nil {
return cleanupOnErr(err)
}
if err := machine.Start(ctx); err != nil {
// Use a fresh context: the request ctx may already be cancelled (client
// disconnect), but we still need the PID so cleanupRuntime can kill the
// Firecracker process that was spawned before the failure.
live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, apiSock)
s.setVMHandles(vm, live)
return cleanupOnErr(err)
}
live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, apiSock)
s.setVMHandles(vm, live)
op.debugStage("firecracker_started", "pid", live.PID)
op.stage("socket_access", "api_socket", apiSock)
if err := s.net.ensureSocketAccess(ctx, apiSock, "firecracker api socket"); err != nil {
return cleanupOnErr(err)
}
op.stage("vsock_access", "vsock_path", vm.Runtime.VSockPath, "vsock_cid", vm.Runtime.VSockCID)
if err := s.net.ensureSocketAccess(ctx, vm.Runtime.VSockPath, "firecracker vsock socket"); err != nil {
return cleanupOnErr(err)
}
vmCreateStage(ctx, "wait_vsock_agent", "waiting for guest vsock agent")
if err := s.net.waitForGuestVSockAgent(ctx, vm.Runtime.VSockPath, vsockReadyWait); err != nil {
return cleanupOnErr(err)
}
op.stage("post_start_features")
vmCreateStage(ctx, "wait_guest_ready", "waiting for guest services")
if err := s.capHooks.postStart(ctx, vm, image); err != nil {
return cleanupOnErr(err)
}
system.TouchNow(&vm)
op.stage("persist")
vmCreateStage(ctx, "finalize", "saving vm state")
if err := s.store.UpsertVM(ctx, vm); err != nil {
return cleanupOnErr(err)
return model.VMRecord{}, runErr
}
return vm, nil
}