banger/internal/daemon/vm_lifecycle_steps.go
Thales Maciel b8c48765fb
daemon: skip fsck_snapshot on freshly-created system overlays
The fsck_snapshot lifecycle step exists to repair stale bitmaps in
a COW file reused from a prior aborted start — without it, the
later e2cp/e2rm calls in patch_root_overlay refuse to touch the
snapshot. On a freshly-created COW there are no stale bitmaps to
repair, so e2fsck -fy is pure overhead.

system_overlay already tracks whether it created the file this run
(sc.systemOverlayCreated, used to drive the rollback path). Reuse
that flag to skip e2fsck entirely on the create-fresh path. The
reused-COW path keeps the fsck for safety. Saves a few hundred ms
per VM create — small absolute win on top of the lazy-mkfs change,
but free.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 21:37:14 -03:00


package daemon

import (
"context"
"errors"
"fmt"
"os"
"strings"
"banger/internal/firecracker"
"banger/internal/imagepull"
"banger/internal/model"
"banger/internal/roothelper"
"banger/internal/system"
)

// buildKernelArgs assembles the kernel command line for a start.
// Direct-boot images (no initrd) get kernel-level IP config so the
// network is up before init, plus init= pointing at the universal
// first-boot wrapper. Anything else uses the plain variant.
func buildKernelArgs(vm model.VMRecord, image model.Image, bridgeIP, defaultDNS string) string {
if strings.TrimSpace(image.InitrdPath) == "" {
return system.BuildBootArgsWithKernelIP(
vm.Name, vm.Runtime.GuestIP, bridgeIP, defaultDNS,
) + " init=" + imagepull.FirstBootScriptPath
}
return system.BuildBootArgs(vm.Name)
}

// startContext is the mutable state threaded through every start
// step. `vm` and `live` are pointers so steps mutate in place —
// dodges returning redundant copies and keeps step bodies readable.
// Values computed by `startVMLocked` before the driver runs
// (apiSock, dmName, tapName) live here too so each step can read
// them without rederiving.
type startContext struct {
vm *model.VMRecord
image model.Image
live *model.VMHandles
apiSock string
dmName string
tapName string
fcPath string

// systemOverlayCreated records whether the system_overlay step
// actually created the file (vs. the file existing from a crashed
// prior attempt). The undo honours it so a leftover-but-valid
// overlay isn't deleted under us.
systemOverlayCreated bool
}

// startStep is one phase in the start-VM pipeline. Phases with no
// rollback obligation leave `undo` nil — the driver simply skips
// them on the rollback path. `createStage` / `createDetail` are
// forwarded to `vmCreateStage` so the async-create RPC caller sees
// progress; they're "" for phases that were never part of the
// user-facing progress stream.
type startStep struct {
name string
attrs []any
createStage string
createDetail string
run func(ctx context.Context, sc *startContext) error
undo func(ctx context.Context, sc *startContext) error
}

// runStartSteps walks steps in order, logging each via `op.stage`
// (and `vmCreateStage` when the step opted in). On the first
// run-err, it rolls back the prefix (including the failing step, so
// a step that acquired resources before erroring gets its undo
// fired) and returns the original err joined with any rollback err.
//
// Contract: `undo` must be safe to call even when `run` returned
// an error — check zero-value guards rather than assuming success.
// This is cheaper than a two-phase acquire/commit per step and
// matches how `cleanupPreparedCapabilities` in capabilities.go
// treats partial-success rollback.
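//
// The dm_snapshot and tap steps in buildStartSteps show the
// zero-value-guard idiom this contract asks for: their undos bail out
// early when the handle fields were never populated.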
func (s *VMService) runStartSteps(ctx context.Context, op *operationLog, sc *startContext, steps []startStep) error {
done := make([]startStep, 0, len(steps))
for _, step := range steps {
if step.createStage != "" {
vmCreateStage(ctx, step.createStage, step.createDetail)
}
op.stage(step.name, step.attrs...)
if err := step.run(ctx, sc); err != nil {
done = append(done, step) // include the failing step — see contract above
if rollbackErr := s.rollbackStartSteps(op, sc, done); rollbackErr != nil {
err = errors.Join(err, rollbackErr)
}
return err
}
done = append(done, step)
}
return nil
}

// rollbackStartSteps iterates completed steps in reverse, calling
// each non-nil `undo` with a detached context — the original ctx
// may already be cancelled (RPC client disconnect), but cleanup
// still needs to run. Undo errors are joined together; one step's
// failure doesn't short-circuit the rest.
func (s *VMService) rollbackStartSteps(op *operationLog, sc *startContext, done []startStep) error {
var err error
for i := len(done) - 1; i >= 0; i-- {
step := done[i]
if step.undo == nil {
continue
}
op.stage("rollback_" + step.name)
if undoErr := step.undo(context.Background(), sc); undoErr != nil {
err = errors.Join(err, fmt.Errorf("rollback %s: %w", step.name, undoErr))
}
}
return err
}

// buildStartSteps returns the ordered list of phases startVMLocked
// drives. Keeping the list as data (vs. a long linear method body)
// makes the phase inventory diff-readable and lets a test driver
// substitute its own step slice.
//
// Phase names MUST stay 1:1 with the prior inline version — they
// appear in daemon logs, smoke-log greps, and the async-create
// progress stream that clients read.
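//
// A test driver could exercise the rollback contract with a stub
// slice along these lines (errBoom and undone are illustrative names,
// not part of this package):
//
//	steps := []startStep{{
//		name: "boom",
//		run:  func(context.Context, *startContext) error { return errBoom },
//		undo: func(context.Context, *startContext) error { undone = true; return nil },
//	}}
//	err := s.runStartSteps(ctx, op, sc, steps)
//	// err wraps errBoom, and undone is true because the failing
//	// step's own undo still ran.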
func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startStep {
return []startStep{
{
name: "preflight",
createStage: "preflight",
createDetail: "checking host prerequisites",
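// Host checks first, then make sure the per-VM directory exists;
// MkdirAll is a no-op when a prior run already created it.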
run: func(ctx context.Context, sc *startContext) error {
if err := s.validateStartPrereqs(ctx, *sc.vm, sc.image); err != nil {
return err
}
return os.MkdirAll(sc.vm.Runtime.VMDir, 0o755)
},
},
{
name: "cleanup_runtime",
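// Tear down whatever runtime state a previous run of this VM left
// behind, then drop the stale in-memory handles so later steps start
// from a clean slate.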
run: func(ctx context.Context, sc *startContext) error {
if err := s.cleanupRuntime(ctx, *sc.vm, true); err != nil {
return err
}
s.clearVMHandles(*sc.vm)
return nil
},
},
{
name: "bridge",
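// Shared host networking setup; nothing VM-specific is acquired
// here, so the step carries no undo (see the nil-undo note on
// startStep).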
run: func(ctx context.Context, _ *startContext) error {
return s.net.ensureBridge(ctx)
},
},
{
name: "socket_dir",
run: func(_ context.Context, _ *startContext) error {
return s.net.ensureSocketDir()
},
},
{
// prepare_sockets is a new op.stage label — the prior
// inline code ran these `os.RemoveAll` calls before the
// system_overlay stage without a stage marker. Keeping a
// distinct name makes the log trace and rollback (if any
// later step fails) unambiguous.
name: "prepare_sockets",
run: func(_ context.Context, sc *startContext) error {
if err := os.RemoveAll(sc.apiSock); err != nil && !os.IsNotExist(err) {
return err
}
if err := os.RemoveAll(sc.vm.Runtime.VSockPath); err != nil && !os.IsNotExist(err) {
return err
}
return nil
},
},
{
name: "system_overlay",
attrs: []any{"overlay_path", sc.vm.Runtime.SystemOverlay},
createStage: "prepare_rootfs",
createDetail: "preparing system overlay",
run: func(ctx context.Context, sc *startContext) error {
// Record ownership BEFORE the call so a partial-truncate
// failure still triggers cleanup of the half-created file.
if !exists(sc.vm.Runtime.SystemOverlay) {
sc.systemOverlayCreated = true
}
return s.ensureSystemOverlay(ctx, sc.vm)
},
undo: func(_ context.Context, sc *startContext) error {
if !sc.systemOverlayCreated {
return nil
}
if err := os.Remove(sc.vm.Runtime.SystemOverlay); err != nil && !os.IsNotExist(err) {
return err
}
return nil
},
},
{
name: "dm_snapshot",
attrs: []any{"dm_name", sc.dmName},
createStage: "prepare_rootfs",
createDetail: "creating root filesystem snapshot",
run: func(ctx context.Context, sc *startContext) error {
snapHandles, err := s.net.createDMSnapshot(ctx, sc.image.RootfsPath, sc.vm.Runtime.SystemOverlay, sc.dmName)
if err != nil {
// createDMSnapshot cleans up its own partial state on
// err; leave sc.live zero so the undo is a no-op.
return err
}
sc.live.BaseLoop = snapHandles.BaseLoop
sc.live.COWLoop = snapHandles.COWLoop
sc.live.DMName = snapHandles.DMName
sc.live.DMDev = snapHandles.DMDev
s.setVMHandles(sc.vm, *sc.live)
// Fields that used to land next to the (now-deleted)
// cleanupOnErr closure. They belong with the DM
// snapshot because that's the first step producing
// runtime identity the downstream code reads back.
sc.vm.Runtime.APISockPath = sc.apiSock
sc.vm.Runtime.State = model.VMStateRunning
sc.vm.State = model.VMStateRunning
sc.vm.Runtime.LastError = ""
return nil
},
undo: func(ctx context.Context, sc *startContext) error {
if sc.live.DMName == "" && sc.live.BaseLoop == "" && sc.live.COWLoop == "" {
return nil
}
return s.net.cleanupDMSnapshot(ctx, dmSnapshotHandles{
BaseLoop: sc.live.BaseLoop,
COWLoop: sc.live.COWLoop,
DMName: sc.live.DMName,
DMDev: sc.live.DMDev,
})
},
},
{
// e2fsck protects against stale bitmaps in a COW reused
// from a prior aborted start — without it, e2cp/e2rm in
// patch_root_overlay refuse to touch the snapshot. On a
// freshly-created COW (system_overlay just truncated +
// created the file this run) there are no stale bitmaps
// to repair and e2fsck is pure overhead. Skip it in that
// case. Exit codes 0 + 1 are both "ok" when we do run it.
name: "fsck_snapshot",
run: func(ctx context.Context, sc *startContext) error {
if sc.systemOverlayCreated {
return nil
}
return s.privOps().FsckSnapshot(ctx, sc.live.DMDev)
},
},
{
name: "patch_root_overlay",
createStage: "prepare_rootfs",
createDetail: "writing guest configuration",
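// patchRootOverlay writes guest configuration into the snapshot via
// the e2cp/e2rm tooling mentioned in the fsck_snapshot comment above.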
run: func(ctx context.Context, sc *startContext) error {
return s.patchRootOverlay(ctx, *sc.vm, sc.image, sc.live.DMDev)
},
},
{
name: "prepare_host_features",
createStage: "prepare_host_features",
createDetail: "preparing host-side vm features",
run: func(ctx context.Context, sc *startContext) error {
return s.capHooks.prepareHosts(ctx, sc.vm, sc.image)
},
// On err, prepareHosts already cleaned up the prefix that
// succeeded before the failing capability. On success, any
// LATER step failure triggers this undo, which tears down
// ALL prepared caps via their Cleanup hooks.
undo: func(ctx context.Context, sc *startContext) error {
return s.capHooks.cleanupState(ctx, *sc.vm)
},
},
{
name: "tap",
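// acquireTap reports the concrete device it set up; record it in the
// live handles so the undo releases exactly that device.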
run: func(ctx context.Context, sc *startContext) error {
tap, err := s.net.acquireTap(ctx, sc.tapName)
if err != nil {
return err
}
sc.live.TapDevice = tap
s.setVMHandles(sc.vm, *sc.live)
return nil
},
undo: func(ctx context.Context, sc *startContext) error {
if sc.live.TapDevice == "" {
return nil
}
return s.net.releaseTap(ctx, sc.live.TapDevice)
},
},
{
name: "metrics_file",
attrs: []any{"metrics_path", sc.vm.Runtime.MetricsPath},
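// Pre-create an empty file at the metrics path handed to Firecracker
// in firecracker_launch below; the undo removes it again if a later
// step fails.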
run: func(_ context.Context, sc *startContext) error {
return os.WriteFile(sc.vm.Runtime.MetricsPath, nil, 0o644)
},
undo: func(_ context.Context, sc *startContext) error {
if err := os.Remove(sc.vm.Runtime.MetricsPath); err != nil && !os.IsNotExist(err) {
return err
}
return nil
},
},
{
name: "firecracker_binary",
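// Resolves the firecracker binary path into sc.fcPath for the launch
// step; nothing is created on disk, hence no undo.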
run: func(ctx context.Context, sc *startContext) error {
fcPath, err := s.net.firecrackerBinary(ctx)
if err != nil {
return err
}
sc.fcPath = fcPath
return nil
},
},
{
name: "firecracker_launch",
attrs: []any{"log_path", sc.vm.Runtime.LogPath, "metrics_path", sc.vm.Runtime.MetricsPath},
createStage: "boot_firecracker",
createDetail: "starting firecracker",
run: func(ctx context.Context, sc *startContext) error {
kernelArgs := buildKernelArgs(*sc.vm, sc.image, s.config.BridgeIP, s.config.DefaultDNS)
launchReq := roothelper.FirecrackerLaunchRequest{
BinaryPath: sc.fcPath,
VMID: sc.vm.ID,
SocketPath: sc.apiSock,
LogPath: sc.vm.Runtime.LogPath,
MetricsPath: sc.vm.Runtime.MetricsPath,
KernelImagePath: sc.image.KernelPath,
InitrdPath: sc.image.InitrdPath,
KernelArgs: kernelArgs,
Drives: []firecracker.DriveConfig{{
ID: "rootfs",
Path: sc.live.DMDev,
ReadOnly: false,
IsRoot: true,
}},
TapDevice: sc.live.TapDevice,
VSockPath: sc.vm.Runtime.VSockPath,
VSockCID: sc.vm.Runtime.VSockCID,
VCPUCount: sc.vm.Spec.VCPUCount,
MemoryMiB: sc.vm.Spec.MemoryMiB,
}
machineConfig := firecracker.MachineConfig{Drives: launchReq.Drives}
s.capHooks.contributeMachine(&machineConfig, *sc.vm, sc.image)
launchReq.Drives = machineConfig.Drives
pid, err := s.privOps().LaunchFirecracker(ctx, launchReq)
if err != nil {
return err
}
sc.live.PID = pid
s.setVMHandles(sc.vm, *sc.live)
op.debugStage("firecracker_started", "pid", sc.live.PID)
return nil
},
undo: func(ctx context.Context, sc *startContext) error {
var errs []error
if sc.live.PID > 0 {
if err := s.net.killVMProcess(ctx, sc.live.PID); err != nil {
errs = append(errs, err)
}
}
if err := os.Remove(sc.apiSock); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
if err := os.Remove(sc.vm.Runtime.VSockPath); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
return errors.Join(errs...)
},
},
{
name: "socket_access",
attrs: []any{"api_socket", sc.apiSock},
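// The API socket now exists (created by the firecracker process
// started above); confirm the daemon can use it before the vsock
// check and the guest-agent wait below.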
run: func(ctx context.Context, sc *startContext) error {
return s.net.ensureSocketAccess(ctx, sc.apiSock, "firecracker api socket")
},
},
{
name: "vsock_access",
attrs: []any{"vsock_path", sc.vm.Runtime.VSockPath, "vsock_cid", sc.vm.Runtime.VSockCID},
run: func(ctx context.Context, sc *startContext) error {
return s.net.ensureSocketAccess(ctx, sc.vm.Runtime.VSockPath, "firecracker vsock socket")
},
},
{
name: "wait_vsock_agent",
createStage: "wait_vsock_agent",
createDetail: "waiting for guest vsock agent",
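// Block until the guest-side vsock agent answers, bounded by
// vsockReadyWait.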
run: func(ctx context.Context, sc *startContext) error {
return s.net.waitForGuestVSockAgent(ctx, sc.vm.Runtime.VSockPath, vsockReadyWait)
},
},
{
name: "post_start_features",
createStage: "wait_guest_ready",
createDetail: "waiting for guest services",
run: func(ctx context.Context, sc *startContext) error {
return s.capHooks.postStart(ctx, *sc.vm, sc.image)
},
// Capability Cleanup hooks are designed to be idempotent
// (check feature-enabled flag, no-op if nothing to undo),
// so calling cleanupState here is safe whether postStart
// reached every cap or bailed midway.
undo: func(ctx context.Context, sc *startContext) error {
return s.capHooks.cleanupState(ctx, *sc.vm)
},
},
{
name: "persist",
createStage: "finalize",
createDetail: "saving vm state",
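// Stamp the record via TouchNow and write the now-running VM back to
// the store. Last step, so no undo.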
run: func(ctx context.Context, sc *startContext) error {
system.TouchNow(sc.vm)
return s.store.UpsertVM(ctx, *sc.vm)
},
},
}
}