startVMLocked was a ~260-line method running 18 sequential phases
with one lumped error path: on any failure, cleanupOnErr called
cleanupRuntime — a catch-all teardown that didn't distinguish
"this phase acquired resources we should undo" from "this phase is
idempotent." The blast radius was the entire VM lifecycle. Every
tweak to boot, NAT, disk, or auth-sync orchestration had to reason
about a closure that could fire at any of 18 points.
This commit extracts the phases into a data-driven pipeline:
- startContext threads the mutable state (vm, live, apiSock,
dmName, tapName, etc.) through every step by pointer so step
bodies mutate in place without returning copies.
- startStep carries the op.stage name, optional vmCreateStage
progress ping, optional log attrs, a run closure, and an
optional undo closure.
- runStartSteps walks the steps in order, appends the failing step
to the rollback set (so a step that fails after partially acquiring
resources, like machine.Start failing during its post-spawn HTTP
config phase, still gets its undo fired), then iterates the rollback
set in reverse and joins errors via errors.Join.
Each phase that acquires a resource now owns its own undo:
system_overlay removes the file it created, dm_snapshot cleans up
the loop and DM handles it set up, prepare_host_features delegates
to capHooks.cleanupState, tap releases via releaseTap, metrics_file
removes the file, firecracker_launch kills the spawned PID and
drops the sockets, post_start_features calls capHooks.cleanupState
again (capability Cleanup hooks are idempotent — safe to call
whether PostStart reached every cap or not). The 11 phases with
no teardown obligation leave `undo` nil and the driver silently
skips them on rollback.
cleanupRuntime is retired from the start-failure path. It stays
intact for reconcile, stopVMLocked, killVMLocked, deleteVMLocked,
stopStaleVMs — the crash-recovery / lifecycle-teardown contract
those paths rely on is unchanged.
startVMLocked shrinks from ~225 lines of sequential-phase code
plus a cleanupOnErr closure to ~45 lines: compute derived paths,
build the step list, drive it, persist ERROR state on failure.
Stage names are preserved 1:1, so existing log greps and the
async-create progress stream stay compatible.
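Roughly, the slimmed-down body has the following shape (a sketch
only: the path-derivation helpers, the operation-log constructor,
and the error-persistence call are placeholder names, not the real
ones):

    func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image model.Image) error {
        op := newStartOperationLog() // placeholder constructor
        sc := &startContext{
            vm:      &vm,
            image:   image,
            live:    &model.VMHandles{},
            apiSock: deriveAPISockPath(vm), // placeholder helper
            dmName:  deriveDMName(vm),      // placeholder helper
            tapName: deriveTapName(vm),     // placeholder helper
        }
        if err := s.runStartSteps(ctx, op, sc, s.buildStartSteps(op, sc)); err != nil {
            // placeholder: persist ERROR state + LastError before returning
            s.persistStartError(ctx, &vm, err)
            return err
        }
        return nil
    }
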
Tests:
- TestRunStartSteps_RollsBackInReverseOnFailure — pins the contract
(sketched below): every step that succeeded before the failure has
its undo fired in reverse order, the failing step's undo also fires,
and the original error stays visible via errors.Is.
- TestRunStartSteps_SkipsNilUndos — optional-undo contract.
- TestRunStartSteps_JoinsRollbackErrors — undo failures don't
hide the root cause.
- TestRunStartSteps_HappyPathNoRollback — success path never
fires any undo.
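A sketch of how the reverse-order test pins that contract (the test
file itself isn't shown here; it sits alongside this code in the
daemon package, and newTestOperationLog is a stand-in for however
the tests construct an operationLog):

    func TestRunStartSteps_RollsBackInReverseOnFailure(t *testing.T) {
        var calls []string
        boom := errors.New("boom")
        mk := func(name string, fail bool) startStep {
            return startStep{
                name: name,
                run: func(context.Context, *startContext) error {
                    calls = append(calls, "run "+name)
                    if fail {
                        return boom
                    }
                    return nil
                },
                undo: func(context.Context, *startContext) error {
                    calls = append(calls, "undo "+name)
                    return nil
                },
            }
        }
        s := &VMService{}
        err := s.runStartSteps(context.Background(), newTestOperationLog(t), &startContext{},
            []startStep{mk("a", false), mk("b", false), mk("c", true)})
        if !errors.Is(err, boom) {
            t.Fatalf("original error not in chain: %v", err)
        }
        want := []string{"run a", "run b", "run c", "undo c", "undo b", "undo a"}
        if !reflect.DeepEqual(calls, want) {
            t.Fatalf("order = %v, want %v", calls, want)
        }
    }
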
Smoke: all 21 scenarios pass, including the start-path ones
(bare vm run, workspace vm run, vm restart, vm lifecycle, vm set
reconfig) that exercise real firecracker boots end-to-end.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
package daemon

import (
	"context"
	"errors"
	"fmt"
	"os"
	"strings"

	"banger/internal/firecracker"
	"banger/internal/imagepull"
	"banger/internal/model"
	"banger/internal/system"
)

// buildKernelArgs assembles the kernel command line for a start.
// Direct-boot images (no initrd) get kernel-level IP config so the
// network is up before init, plus init= pointing at the universal
// first-boot wrapper. Anything else uses the plain variant.
func buildKernelArgs(vm model.VMRecord, image model.Image, bridgeIP, defaultDNS string) string {
	if strings.TrimSpace(image.InitrdPath) == "" {
		return system.BuildBootArgsWithKernelIP(
			vm.Name, vm.Runtime.GuestIP, bridgeIP, defaultDNS,
		) + " init=" + imagepull.FirstBootScriptPath
	}
	return system.BuildBootArgs(vm.Name)
}

// startContext is the mutable state threaded through every start
// step. `vm` and `live` are pointers so steps mutate in place —
// dodges returning redundant copies and keeps step bodies readable.
// Values computed by `startVMLocked` before the driver runs
// (apiSock, dmName, tapName) live here too so each step can read
// them without rederiving.
type startContext struct {
	vm      *model.VMRecord
	image   model.Image
	live    *model.VMHandles
	apiSock string
	dmName  string
	tapName string
	fcPath  string
	machine *firecracker.Machine

	// systemOverlayCreated records whether the system_overlay step
	// actually created the file (vs. the file existing from a crashed
	// prior attempt). The undo honours it so a leftover-but-valid
	// overlay isn't deleted under us.
	systemOverlayCreated bool
}

// startStep is one phase in the start-VM pipeline. Phases with no
// rollback obligation leave `undo` nil — the driver simply skips
// them on the rollback path. `createStage` / `createDetail` are
// forwarded to `vmCreateStage` so the async-create RPC caller sees
// progress; they're "" for phases that were never part of the
// user-facing progress stream.
type startStep struct {
	name         string
	attrs        []any
	createStage  string
	createDetail string
	run          func(ctx context.Context, sc *startContext) error
	undo         func(ctx context.Context, sc *startContext) error
}

// runStartSteps walks steps in order, logging each via `op.stage`
// (and `vmCreateStage` when the step opted in). On the first
// run-err, it rolls back the prefix (including the failing step, so
// a step that acquired resources before erroring gets its undo
// fired) and returns the original err joined with any rollback err.
//
// Contract: `undo` must be safe to call even when `run` returned
// an error — check zero-value guards rather than assuming success.
// This is cheaper than a two-phase acquire/commit per step and
// matches how `cleanupPreparedCapabilities` in capabilities.go
// treats partial-success rollback.
func (s *VMService) runStartSteps(ctx context.Context, op *operationLog, sc *startContext, steps []startStep) error {
	done := make([]startStep, 0, len(steps))
	for _, step := range steps {
		if step.createStage != "" {
			vmCreateStage(ctx, step.createStage, step.createDetail)
		}
		op.stage(step.name, step.attrs...)
		if err := step.run(ctx, sc); err != nil {
			done = append(done, step) // include the failing step — see contract above
			if rollbackErr := s.rollbackStartSteps(op, sc, done); rollbackErr != nil {
				err = errors.Join(err, rollbackErr)
			}
			return err
		}
		done = append(done, step)
	}
	return nil
}

// rollbackStartSteps iterates completed steps in reverse, calling
// each non-nil `undo` with a detached context — the original ctx
// may already be cancelled (RPC client disconnect), but cleanup
// still needs to run. Undo errors are joined together; one step's
// failure doesn't short-circuit the rest.
func (s *VMService) rollbackStartSteps(op *operationLog, sc *startContext, done []startStep) error {
	var err error
	for i := len(done) - 1; i >= 0; i-- {
		step := done[i]
		if step.undo == nil {
			continue
		}
		op.stage("rollback_" + step.name)
		if undoErr := step.undo(context.Background(), sc); undoErr != nil {
			err = errors.Join(err, fmt.Errorf("rollback %s: %w", step.name, undoErr))
		}
	}
	return err
}

// buildStartSteps returns the ordered list of phases startVMLocked
// drives. Keeping the list as data (vs. a long linear method body)
// makes the phase inventory diff-readable and lets a test driver
// substitute its own step slice.
//
// Phase names MUST stay 1:1 with the prior inline version — they
// appear in daemon logs, smoke-log greps, and the async-create
// progress stream that clients read.
func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startStep {
	return []startStep{
		{
			name:         "preflight",
			createStage:  "preflight",
			createDetail: "checking host prerequisites",
			run: func(ctx context.Context, sc *startContext) error {
				if err := s.validateStartPrereqs(ctx, *sc.vm, sc.image); err != nil {
					return err
				}
				return os.MkdirAll(sc.vm.Runtime.VMDir, 0o755)
			},
		},
		{
			name: "cleanup_runtime",
			run: func(ctx context.Context, sc *startContext) error {
				if err := s.cleanupRuntime(ctx, *sc.vm, true); err != nil {
					return err
				}
				s.clearVMHandles(*sc.vm)
				return nil
			},
		},
		{
			name: "bridge",
			run: func(ctx context.Context, _ *startContext) error {
				return s.net.ensureBridge(ctx)
			},
		},
		{
			name: "socket_dir",
			run: func(_ context.Context, _ *startContext) error {
				return s.net.ensureSocketDir()
			},
		},
		{
			// prepare_sockets is a new op.stage label — the prior
			// inline code ran these `os.RemoveAll` calls before the
			// system_overlay stage without a stage marker. Keeping a
			// distinct name makes the log trace and rollback (if any
			// later step fails) unambiguous.
			name: "prepare_sockets",
			run: func(_ context.Context, sc *startContext) error {
				if err := os.RemoveAll(sc.apiSock); err != nil && !os.IsNotExist(err) {
					return err
				}
				if err := os.RemoveAll(sc.vm.Runtime.VSockPath); err != nil && !os.IsNotExist(err) {
					return err
				}
				return nil
			},
		},
		{
			name:         "system_overlay",
			attrs:        []any{"overlay_path", sc.vm.Runtime.SystemOverlay},
			createStage:  "prepare_rootfs",
			createDetail: "preparing system overlay",
			run: func(ctx context.Context, sc *startContext) error {
				// Record ownership BEFORE the call so a partial-truncate
				// failure still triggers cleanup of the half-created file.
				if !exists(sc.vm.Runtime.SystemOverlay) {
					sc.systemOverlayCreated = true
				}
				return s.ensureSystemOverlay(ctx, sc.vm)
			},
			undo: func(_ context.Context, sc *startContext) error {
				if !sc.systemOverlayCreated {
					return nil
				}
				if err := os.Remove(sc.vm.Runtime.SystemOverlay); err != nil && !os.IsNotExist(err) {
					return err
				}
				return nil
			},
		},
		{
			name:         "dm_snapshot",
			attrs:        []any{"dm_name", sc.dmName},
			createStage:  "prepare_rootfs",
			createDetail: "creating root filesystem snapshot",
			run: func(ctx context.Context, sc *startContext) error {
				snapHandles, err := s.net.createDMSnapshot(ctx, sc.image.RootfsPath, sc.vm.Runtime.SystemOverlay, sc.dmName)
				if err != nil {
					// createDMSnapshot cleans up its own partial state on
					// err; leave sc.live zero so the undo is a no-op.
					return err
				}
				sc.live.BaseLoop = snapHandles.BaseLoop
				sc.live.COWLoop = snapHandles.COWLoop
				sc.live.DMName = snapHandles.DMName
				sc.live.DMDev = snapHandles.DMDev
				s.setVMHandles(*sc.vm, *sc.live)
				// Fields that used to land next to the (now-deleted)
				// cleanupOnErr closure. They belong with the DM
				// snapshot because that's the first step producing
				// runtime identity the downstream code reads back.
				sc.vm.Runtime.APISockPath = sc.apiSock
				sc.vm.Runtime.State = model.VMStateRunning
				sc.vm.State = model.VMStateRunning
				sc.vm.Runtime.LastError = ""
				return nil
			},
			undo: func(ctx context.Context, sc *startContext) error {
				if sc.live.DMName == "" && sc.live.BaseLoop == "" && sc.live.COWLoop == "" {
					return nil
				}
				return s.net.cleanupDMSnapshot(ctx, dmSnapshotHandles{
					BaseLoop: sc.live.BaseLoop,
					COWLoop:  sc.live.COWLoop,
					DMName:   sc.live.DMName,
					DMDev:    sc.live.DMDev,
				})
			},
		},
		{
			// See the comment in the prior inline version: stale
			// bitmaps from a reused COW make e2cp/e2rm refuse to
			// touch the snapshot. e2fsck -fy is a no-op on a fresh
			// snapshot. Exit codes 0 + 1 are both "ok" here.
			name: "fsck_snapshot",
			run: func(ctx context.Context, sc *startContext) error {
				if _, err := s.runner.RunSudo(ctx, "e2fsck", "-fy", sc.live.DMDev); err != nil {
					if code := system.ExitCode(err); code < 0 || code > 1 {
						return fmt.Errorf("fsck snapshot: %w", err)
					}
				}
				return nil
			},
		},
		{
			name:         "patch_root_overlay",
			createStage:  "prepare_rootfs",
			createDetail: "writing guest configuration",
			run: func(ctx context.Context, sc *startContext) error {
				return s.patchRootOverlay(ctx, *sc.vm, sc.image)
			},
		},
		{
			name:         "prepare_host_features",
			createStage:  "prepare_host_features",
			createDetail: "preparing host-side vm features",
			run: func(ctx context.Context, sc *startContext) error {
				return s.capHooks.prepareHosts(ctx, sc.vm, sc.image)
			},
			// On err, prepareHosts already cleaned up the prefix that
			// succeeded before the failing capability. On success, any
			// LATER step failure triggers this undo, which tears down
			// ALL prepared caps via their Cleanup hooks.
			undo: func(ctx context.Context, sc *startContext) error {
				return s.capHooks.cleanupState(ctx, *sc.vm)
			},
		},
		{
			name: "tap",
			run: func(ctx context.Context, sc *startContext) error {
				tap, err := s.net.acquireTap(ctx, sc.tapName)
				if err != nil {
					return err
				}
				sc.live.TapDevice = tap
				s.setVMHandles(*sc.vm, *sc.live)
				// Mirror onto VM.Runtime for NAT teardown resilience
				// across daemon crashes — see vm.Runtime.TapDevice docs.
				sc.vm.Runtime.TapDevice = tap
				return nil
			},
			undo: func(ctx context.Context, sc *startContext) error {
				if sc.live.TapDevice == "" {
					return nil
				}
				return s.net.releaseTap(ctx, sc.live.TapDevice)
			},
		},
		{
			name:  "metrics_file",
			attrs: []any{"metrics_path", sc.vm.Runtime.MetricsPath},
			run: func(_ context.Context, sc *startContext) error {
				return os.WriteFile(sc.vm.Runtime.MetricsPath, nil, 0o644)
			},
			undo: func(_ context.Context, sc *startContext) error {
				if err := os.Remove(sc.vm.Runtime.MetricsPath); err != nil && !os.IsNotExist(err) {
					return err
				}
				return nil
			},
		},
		{
			name: "firecracker_binary",
			run: func(_ context.Context, sc *startContext) error {
				fcPath, err := s.net.firecrackerBinary()
				if err != nil {
					return err
				}
				sc.fcPath = fcPath
				return nil
			},
		},
		{
			name:         "firecracker_launch",
			attrs:        []any{"log_path", sc.vm.Runtime.LogPath, "metrics_path", sc.vm.Runtime.MetricsPath},
			createStage:  "boot_firecracker",
			createDetail: "starting firecracker",
			run: func(ctx context.Context, sc *startContext) error {
				kernelArgs := buildKernelArgs(*sc.vm, sc.image, s.config.BridgeIP, s.config.DefaultDNS)
				machineConfig := firecracker.MachineConfig{
					BinaryPath:      sc.fcPath,
					VMID:            sc.vm.ID,
					SocketPath:      sc.apiSock,
					LogPath:         sc.vm.Runtime.LogPath,
					MetricsPath:     sc.vm.Runtime.MetricsPath,
					KernelImagePath: sc.image.KernelPath,
					InitrdPath:      sc.image.InitrdPath,
					KernelArgs:      kernelArgs,
					Drives: []firecracker.DriveConfig{{
						ID:       "rootfs",
						Path:     sc.live.DMDev,
						ReadOnly: false,
						IsRoot:   true,
					}},
					TapDevice: sc.live.TapDevice,
					VSockPath: sc.vm.Runtime.VSockPath,
					VSockCID:  sc.vm.Runtime.VSockCID,
					VCPUCount: sc.vm.Spec.VCPUCount,
					MemoryMiB: sc.vm.Spec.MemoryMiB,
					Logger:    s.logger,
				}
				s.capHooks.contributeMachine(&machineConfig, *sc.vm, sc.image)
				machine, err := firecracker.NewMachine(ctx, machineConfig)
				if err != nil {
					return err
				}
				sc.machine = machine
				if err := machine.Start(ctx); err != nil {
					// machine.Start can fail AFTER the firecracker process
					// is already spawned (HTTP config phase). Record the
					// PID so the undo can kill it; use a fresh ctx since
					// the request ctx may be cancelled by now.
					sc.live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, sc.apiSock)
					s.setVMHandles(*sc.vm, *sc.live)
					return err
				}
				sc.live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, sc.apiSock)
				s.setVMHandles(*sc.vm, *sc.live)
				op.debugStage("firecracker_started", "pid", sc.live.PID)
				return nil
			},
			undo: func(ctx context.Context, sc *startContext) error {
				var errs []error
				if sc.live.PID > 0 {
					if err := s.net.killVMProcess(ctx, sc.live.PID); err != nil {
						errs = append(errs, err)
					}
				}
				if err := os.Remove(sc.apiSock); err != nil && !os.IsNotExist(err) {
					errs = append(errs, err)
				}
				if err := os.Remove(sc.vm.Runtime.VSockPath); err != nil && !os.IsNotExist(err) {
					errs = append(errs, err)
				}
				return errors.Join(errs...)
			},
		},
		{
			name:  "socket_access",
			attrs: []any{"api_socket", sc.apiSock},
			run: func(ctx context.Context, sc *startContext) error {
				return s.net.ensureSocketAccess(ctx, sc.apiSock, "firecracker api socket")
			},
		},
		{
			name:  "vsock_access",
			attrs: []any{"vsock_path", sc.vm.Runtime.VSockPath, "vsock_cid", sc.vm.Runtime.VSockCID},
			run: func(ctx context.Context, sc *startContext) error {
				return s.net.ensureSocketAccess(ctx, sc.vm.Runtime.VSockPath, "firecracker vsock socket")
			},
		},
		{
			name:         "wait_vsock_agent",
			createStage:  "wait_vsock_agent",
			createDetail: "waiting for guest vsock agent",
			run: func(ctx context.Context, sc *startContext) error {
				return s.net.waitForGuestVSockAgent(ctx, sc.vm.Runtime.VSockPath, vsockReadyWait)
			},
		},
		{
			name:         "post_start_features",
			createStage:  "wait_guest_ready",
			createDetail: "waiting for guest services",
			run: func(ctx context.Context, sc *startContext) error {
				return s.capHooks.postStart(ctx, *sc.vm, sc.image)
			},
			// Capability Cleanup hooks are designed to be idempotent
			// (check feature-enabled flag, no-op if nothing to undo),
			// so calling cleanupState here is safe whether postStart
			// reached every cap or bailed midway.
			undo: func(ctx context.Context, sc *startContext) error {
				return s.capHooks.cleanupState(ctx, *sc.vm)
			},
		},
		{
			name:         "persist",
			createStage:  "finalize",
			createDetail: "saving vm state",
			run: func(ctx context.Context, sc *startContext) error {
				system.TouchNow(sc.vm)
				return s.store.UpsertVM(ctx, *sc.vm)
			},
		},
	}
}