banger/internal/daemon/vm_lifecycle_steps.go
Thales Maciel 59e48e830b
daemon: split owner daemon from root helper
Move the supported systemd path to two services: an owner-user bangerd for
orchestration and a narrow root helper for bridge/tap, NAT/resolver, dm/loop,
and Firecracker ownership. This removes repeated sudo from daily vm and image
flows without leaving the general daemon running as root.

Add install metadata, system install/status/restart/uninstall commands, and a
system-owned runtime layout. Keep user SSH/config material in the owner home,
lock file_sync to the owner home, and move daemon known_hosts handling out of
the old root-owned control path.

Route privileged lifecycle steps through typed privilegedOps calls, harden the
two systemd units, and rewrite smoke plus docs around the supported service
model.

Verified with make build, make test, make lint, and make smoke on the
supported systemd host path.
2026-04-26 12:43:17 -03:00
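
A minimal sketch of the typed surface the message implies, inferred only from the two call sites visible in this file (FsckSnapshot and LaunchFirecracker); the real privilegedOps definition lives elsewhere and presumably covers the other privileged areas listed above:

type privilegedOps interface {
	// FsckSnapshot runs the snapshot fsck on the dm device (assumption:
	// signature inferred from the s.privOps().FsckSnapshot call below).
	FsckSnapshot(ctx context.Context, dmDev string) error
	// LaunchFirecracker starts the VMM and returns its pid (assumption:
	// signature inferred from the s.privOps().LaunchFirecracker call below).
	LaunchFirecracker(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (int, error)
}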


package daemon

import (
"context"
"errors"
"fmt"
"os"
"strings"

"banger/internal/firecracker"
"banger/internal/imagepull"
"banger/internal/model"
"banger/internal/roothelper"
"banger/internal/system"
)

// buildKernelArgs assembles the kernel command line for a start.
// Direct-boot images (no initrd) get kernel-level IP config so the
// network is up before init, plus init= pointing at the universal
// first-boot wrapper. Anything else uses the plain variant.
func buildKernelArgs(vm model.VMRecord, image model.Image, bridgeIP, defaultDNS string) string {
if strings.TrimSpace(image.InitrdPath) == "" {
return system.BuildBootArgsWithKernelIP(
vm.Name, vm.Runtime.GuestIP, bridgeIP, defaultDNS,
) + " init=" + imagepull.FirstBootScriptPath
}
return system.BuildBootArgs(vm.Name)
}
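
// Illustration only: the Linux kernel's ip= boot parameter has the shape
// ip=<client>:<server>:<gw>:<netmask>:<hostname>:<device>:<autoconf>, so
// BuildBootArgsWithKernelIP presumably emits something of that shape from
// vm.Runtime.GuestIP, bridgeIP, and defaultDNS; the exact string is owned
// by the system package and is not reproduced here.
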
// startContext is the mutable state threaded through every start
// step. `vm` and `live` are pointers so steps mutate in place, which
// avoids returning redundant copies and keeps step bodies readable.
// Values computed by `startVMLocked` before the driver runs
// (apiSock, dmName, tapName) live here too so each step can read
// them without rederiving them.
type startContext struct {
vm *model.VMRecord
image model.Image
live *model.VMHandles
apiSock string
dmName string
tapName string
fcPath string
// systemOverlayCreated records whether the system_overlay step
// actually created the file (vs. the file existing from a crashed
// prior attempt). The undo honours it so a leftover-but-valid
// overlay isn't deleted under us.
systemOverlayCreated bool
}
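
// A hedged sketch of how startVMLocked (not shown in this file) might
// seed the context before handing it to the step driver:
//
//	sc := &startContext{
//	    vm:    &vm,
//	    image: image,
//	    live:  &model.VMHandles{},
//	    // apiSock, dmName, and tapName are computed by the caller per
//	    // the doc comment above; fcPath is filled in later by the
//	    // firecracker_binary step.
//	}
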
// startStep is one phase in the start-VM pipeline. Phases with no
// rollback obligation leave `undo` nil — the driver simply skips
// them on the rollback path. `createStage` / `createDetail` are
// forwarded to `vmCreateStage` so the async-create RPC caller sees
// progress; they're "" for phases that were never part of the
// user-facing progress stream.
type startStep struct {
name string
attrs []any
createStage string
createDetail string
run func(ctx context.Context, sc *startContext) error
undo func(ctx context.Context, sc *startContext) error
}
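
// Illustrative only: a phase with no rollback obligation and no entry in
// the user-facing progress stream leaves undo nil and createStage /
// createDetail empty:
//
//	startStep{
//	    name: "example_phase",
//	    run: func(ctx context.Context, sc *startContext) error {
//	        return nil
//	    },
//	}
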
// runStartSteps walks steps in order, logging each via `op.stage`
// (and `vmCreateStage` when the step opted in). On the first run
// error, it rolls back the completed prefix (including the failing
// step, so a step that acquired resources before erroring gets its
// undo fired) and returns the original error joined with any
// rollback error.
//
// Contract: `undo` must be safe to call even when `run` returned
// an error — check zero-value guards rather than assuming success.
// This is cheaper than a two-phase acquire/commit per step and
// matches how `cleanupPreparedCapabilities` in capabilities.go
// treats partial-success rollback.
func (s *VMService) runStartSteps(ctx context.Context, op *operationLog, sc *startContext, steps []startStep) error {
done := make([]startStep, 0, len(steps))
for _, step := range steps {
if step.createStage != "" {
vmCreateStage(ctx, step.createStage, step.createDetail)
}
op.stage(step.name, step.attrs...)
if err := step.run(ctx, sc); err != nil {
done = append(done, step) // include the failing step — see contract above
if rollbackErr := s.rollbackStartSteps(op, sc, done); rollbackErr != nil {
err = errors.Join(err, rollbackErr)
}
return err
}
done = append(done, step)
}
return nil
}
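
// Worked example (illustrative): given steps [a, b, c] where b.run fails
// and both a and b carry undo hooks, the driver logs stage a, stage b,
// then rollback_b and rollback_a; c never runs, and the caller receives
// b's error joined with any undo errors.
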
// rollbackStartSteps iterates completed steps in reverse, calling
// each non-nil `undo` with a detached context — the original ctx
// may already be cancelled (RPC client disconnect), but cleanup
// still needs to run. Undo errors are joined together; one step's
// failure doesn't short-circuit the rest.
func (s *VMService) rollbackStartSteps(op *operationLog, sc *startContext, done []startStep) error {
var err error
for i := len(done) - 1; i >= 0; i-- {
step := done[i]
if step.undo == nil {
continue
}
op.stage("rollback_" + step.name)
if undoErr := step.undo(context.Background(), sc); undoErr != nil {
err = errors.Join(err, fmt.Errorf("rollback %s: %w", step.name, undoErr))
}
}
return err
}
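
// Note: errors.Join discards nil arguments and returns nil when no
// non-nil error was passed, so a rollback whose undos all succeed
// reports no error to the caller.
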
// buildStartSteps returns the ordered list of phases startVMLocked
// drives. Keeping the list as data (vs. a long linear method body)
// makes the phase inventory diff-readable and lets a test driver
// substitute its own step slice.
//
// Phase names MUST stay 1:1 with the prior inline version — they
// appear in daemon logs, smoke-log greps, and the async-create
// progress stream that clients read.
func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startStep {
return []startStep{
{
name: "preflight",
createStage: "preflight",
createDetail: "checking host prerequisites",
run: func(ctx context.Context, sc *startContext) error {
if err := s.validateStartPrereqs(ctx, *sc.vm, sc.image); err != nil {
return err
}
return os.MkdirAll(sc.vm.Runtime.VMDir, 0o755)
},
},
{
name: "cleanup_runtime",
run: func(ctx context.Context, sc *startContext) error {
if err := s.cleanupRuntime(ctx, *sc.vm, true); err != nil {
return err
}
s.clearVMHandles(*sc.vm)
return nil
},
},
{
name: "bridge",
run: func(ctx context.Context, _ *startContext) error {
return s.net.ensureBridge(ctx)
},
},
{
name: "socket_dir",
run: func(_ context.Context, _ *startContext) error {
return s.net.ensureSocketDir()
},
},
{
// prepare_sockets is a new op.stage label — the prior
// inline code ran these `os.RemoveAll` calls before the
// system_overlay stage without a stage marker. Keeping a
// distinct name makes the log trace and rollback (if any
// later step fails) unambiguous.
name: "prepare_sockets",
run: func(_ context.Context, sc *startContext) error {
if err := os.RemoveAll(sc.apiSock); err != nil && !os.IsNotExist(err) {
return err
}
if err := os.RemoveAll(sc.vm.Runtime.VSockPath); err != nil && !os.IsNotExist(err) {
return err
}
return nil
},
},
{
name: "system_overlay",
attrs: []any{"overlay_path", sc.vm.Runtime.SystemOverlay},
createStage: "prepare_rootfs",
createDetail: "preparing system overlay",
run: func(ctx context.Context, sc *startContext) error {
// Record ownership BEFORE the call so a partial-truncate
// failure still triggers cleanup of the half-created file.
if !exists(sc.vm.Runtime.SystemOverlay) {
sc.systemOverlayCreated = true
}
return s.ensureSystemOverlay(ctx, sc.vm)
},
undo: func(_ context.Context, sc *startContext) error {
if !sc.systemOverlayCreated {
return nil
}
if err := os.Remove(sc.vm.Runtime.SystemOverlay); err != nil && !os.IsNotExist(err) {
return err
}
return nil
},
},
{
name: "dm_snapshot",
attrs: []any{"dm_name", sc.dmName},
createStage: "prepare_rootfs",
createDetail: "creating root filesystem snapshot",
run: func(ctx context.Context, sc *startContext) error {
snapHandles, err := s.net.createDMSnapshot(ctx, sc.image.RootfsPath, sc.vm.Runtime.SystemOverlay, sc.dmName)
if err != nil {
// createDMSnapshot cleans up its own partial state on
// err; leave sc.live zero so the undo is a no-op.
return err
}
sc.live.BaseLoop = snapHandles.BaseLoop
sc.live.COWLoop = snapHandles.COWLoop
sc.live.DMName = snapHandles.DMName
sc.live.DMDev = snapHandles.DMDev
s.setVMHandles(sc.vm, *sc.live)
// These fields used to land next to the (now-deleted)
// cleanupOnErr closure. They belong with the DM
// snapshot step because it is the first step producing
// runtime identity that downstream code reads back.
sc.vm.Runtime.APISockPath = sc.apiSock
sc.vm.Runtime.State = model.VMStateRunning
sc.vm.State = model.VMStateRunning
sc.vm.Runtime.LastError = ""
return nil
},
undo: func(ctx context.Context, sc *startContext) error {
if sc.live.DMName == "" && sc.live.BaseLoop == "" && sc.live.COWLoop == "" {
return nil
}
return s.net.cleanupDMSnapshot(ctx, dmSnapshotHandles{
BaseLoop: sc.live.BaseLoop,
COWLoop: sc.live.COWLoop,
DMName: sc.live.DMName,
DMDev: sc.live.DMDev,
})
},
},
{
// See the comment in the prior inline version: stale
// bitmaps from a reused COW make e2cp/e2rm refuse to
// touch the snapshot. e2fsck -fy is a no-op on a fresh
// snapshot. Exit codes 0 and 1 are both "ok" here
// (e2fsck exits 1 when it corrected errors).
name: "fsck_snapshot",
run: func(ctx context.Context, sc *startContext) error {
return s.privOps().FsckSnapshot(ctx, sc.live.DMDev)
},
},
{
name: "patch_root_overlay",
createStage: "prepare_rootfs",
createDetail: "writing guest configuration",
run: func(ctx context.Context, sc *startContext) error {
return s.patchRootOverlay(ctx, *sc.vm, sc.image, sc.live.DMDev)
},
},
{
name: "prepare_host_features",
createStage: "prepare_host_features",
createDetail: "preparing host-side vm features",
run: func(ctx context.Context, sc *startContext) error {
return s.capHooks.prepareHosts(ctx, sc.vm, sc.image)
},
// On err, prepareHosts already cleaned up the prefix that
// succeeded before the failing capability. On success, any
// LATER step failure triggers this undo, which tears down
// ALL prepared caps via their Cleanup hooks.
undo: func(ctx context.Context, sc *startContext) error {
return s.capHooks.cleanupState(ctx, *sc.vm)
},
},
{
name: "tap",
run: func(ctx context.Context, sc *startContext) error {
tap, err := s.net.acquireTap(ctx, sc.tapName)
if err != nil {
return err
}
sc.live.TapDevice = tap
s.setVMHandles(sc.vm, *sc.live)
return nil
},
undo: func(ctx context.Context, sc *startContext) error {
if sc.live.TapDevice == "" {
return nil
}
return s.net.releaseTap(ctx, sc.live.TapDevice)
},
},
{
name: "metrics_file",
attrs: []any{"metrics_path", sc.vm.Runtime.MetricsPath},
run: func(_ context.Context, sc *startContext) error {
return os.WriteFile(sc.vm.Runtime.MetricsPath, nil, 0o644)
},
undo: func(_ context.Context, sc *startContext) error {
if err := os.Remove(sc.vm.Runtime.MetricsPath); err != nil && !os.IsNotExist(err) {
return err
}
return nil
},
},
{
name: "firecracker_binary",
run: func(ctx context.Context, sc *startContext) error {
fcPath, err := s.net.firecrackerBinary(ctx)
if err != nil {
return err
}
sc.fcPath = fcPath
return nil
},
},
{
name: "firecracker_launch",
attrs: []any{"log_path", sc.vm.Runtime.LogPath, "metrics_path", sc.vm.Runtime.MetricsPath},
createStage: "boot_firecracker",
createDetail: "starting firecracker",
run: func(ctx context.Context, sc *startContext) error {
kernelArgs := buildKernelArgs(*sc.vm, sc.image, s.config.BridgeIP, s.config.DefaultDNS)
launchReq := roothelper.FirecrackerLaunchRequest{
BinaryPath: sc.fcPath,
VMID: sc.vm.ID,
SocketPath: sc.apiSock,
LogPath: sc.vm.Runtime.LogPath,
MetricsPath: sc.vm.Runtime.MetricsPath,
KernelImagePath: sc.image.KernelPath,
InitrdPath: sc.image.InitrdPath,
KernelArgs: kernelArgs,
Drives: []firecracker.DriveConfig{{
ID: "rootfs",
Path: sc.live.DMDev,
ReadOnly: false,
IsRoot: true,
}},
TapDevice: sc.live.TapDevice,
VSockPath: sc.vm.Runtime.VSockPath,
VSockCID: sc.vm.Runtime.VSockCID,
VCPUCount: sc.vm.Spec.VCPUCount,
MemoryMiB: sc.vm.Spec.MemoryMiB,
}
machineConfig := firecracker.MachineConfig{Drives: launchReq.Drives}
s.capHooks.contributeMachine(&machineConfig, *sc.vm, sc.image)
launchReq.Drives = machineConfig.Drives
pid, err := s.privOps().LaunchFirecracker(ctx, launchReq)
if err != nil {
return err
}
sc.live.PID = pid
s.setVMHandles(sc.vm, *sc.live)
op.debugStage("firecracker_started", "pid", sc.live.PID)
return nil
},
undo: func(ctx context.Context, sc *startContext) error {
var errs []error
if sc.live.PID > 0 {
if err := s.net.killVMProcess(ctx, sc.live.PID); err != nil {
errs = append(errs, err)
}
}
if err := os.Remove(sc.apiSock); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
if err := os.Remove(sc.vm.Runtime.VSockPath); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
return errors.Join(errs...)
},
},
{
name: "socket_access",
attrs: []any{"api_socket", sc.apiSock},
run: func(ctx context.Context, sc *startContext) error {
return s.net.ensureSocketAccess(ctx, sc.apiSock, "firecracker api socket")
},
},
{
name: "vsock_access",
attrs: []any{"vsock_path", sc.vm.Runtime.VSockPath, "vsock_cid", sc.vm.Runtime.VSockCID},
run: func(ctx context.Context, sc *startContext) error {
return s.net.ensureSocketAccess(ctx, sc.vm.Runtime.VSockPath, "firecracker vsock socket")
},
},
{
name: "wait_vsock_agent",
createStage: "wait_vsock_agent",
createDetail: "waiting for guest vsock agent",
run: func(ctx context.Context, sc *startContext) error {
return s.net.waitForGuestVSockAgent(ctx, sc.vm.Runtime.VSockPath, vsockReadyWait)
},
},
{
name: "post_start_features",
createStage: "wait_guest_ready",
createDetail: "waiting for guest services",
run: func(ctx context.Context, sc *startContext) error {
return s.capHooks.postStart(ctx, *sc.vm, sc.image)
},
// Capability Cleanup hooks are designed to be idempotent
// (check feature-enabled flag, no-op if nothing to undo),
// so calling cleanupState here is safe whether postStart
// reached every cap or bailed midway.
undo: func(ctx context.Context, sc *startContext) error {
return s.capHooks.cleanupState(ctx, *sc.vm)
},
},
{
name: "persist",
createStage: "finalize",
createDetail: "saving vm state",
run: func(ctx context.Context, sc *startContext) error {
system.TouchNow(sc.vm)
return s.store.UpsertVM(ctx, *sc.vm)
},
},
}
}
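
// Sketch of the test seam the buildStartSteps comment describes
// (hypothetical test code; errBoom and svc are stand-ins, and no such
// test is shown in this file):
//
//	steps := []startStep{{
//	    name: "stub",
//	    run:  func(ctx context.Context, sc *startContext) error { return errBoom },
//	    undo: func(ctx context.Context, sc *startContext) error { return nil },
//	}}
//	err := svc.runStartSteps(ctx, op, sc, steps)
//	// err wraps errBoom, and the stub's undo has fired.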