Each VM's firecracker now runs inside a per-VM chroot, dropped to the
registered owner UID via firecracker-jailer. This closes the broad
ambient-sudo escalation surface that survived Phase A: the helper still
needs capabilities for tap/bridge/dm/loop/iptables work, but the VMM
itself no longer runs as root in the host root filesystem.

The host helper stages each chroot up front: it hard-links the kernel
and (optional) initrd, mknods the block-device drives plus
/dev/vhost-vsock, copies in the firecracker binary (jailer opens it
O_RDWR, so a read-only bind mount fails with EROFS), and bind-mounts
the /usr/lib and /lib trees read-only so the dynamic linker can resolve
its libraries. The helper self-binds the chroot first so the
findmnt-guarded cleanup can recurse safely.

AF_UNIX sun_path is 108 bytes, and the chroot path easily blows past
that. The daemon-side launch therefore pre-symlinks the short request
socket path to the long chroot socket path before Machine.Start, so the
SDK's poll/connect sees the short path while the kernel resolves to the
chroot socket.

--new-pid-ns is intentionally disabled: jailer's PID-namespace fork
makes the SDK see the parent exit and tear the API socket down too
early.

CapabilityBoundingSet for the helper expands to add CAP_FOWNER,
CAP_KILL, CAP_MKNOD, CAP_SETGID, CAP_SETUID, and CAP_SYS_CHROOT
alongside the existing CAP_CHOWN/CAP_DAC_OVERRIDE/CAP_NET_ADMIN/
CAP_NET_RAW/CAP_SYS_ADMIN.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
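
For reference, a minimal standalone sketch of the daemon-side sun_path
workaround described above; the paths and the linkAPISocket helper name
are invented for illustration, not taken from this change:

package main

import (
	"fmt"
	"os"
)

// linkAPISocket points a short request-socket path at the long in-chroot
// socket path. connect(2) follows symlinks during path resolution, so a
// client can poll/connect on the short path (safely under the 108-byte
// AF_UNIX sun_path limit) while the kernel resolves to the socket
// firecracker bound inside the chroot.
func linkAPISocket(shortSock, chrootSock string) error {
	// Clear any stale link from a previous run; a missing file is fine.
	if err := os.Remove(shortSock); err != nil && !os.IsNotExist(err) {
		return fmt.Errorf("clear stale socket link: %w", err)
	}
	if err := os.Symlink(chrootSock, shortSock); err != nil {
		return fmt.Errorf("link api socket: %w", err)
	}
	return nil
}

func main() {
	err := linkAPISocket(
		"/tmp/vm-abc.sock", // short: fits in sun_path
		"/tmp/jail/firecracker/vm-abc/root/run/firecracker.socket", // long chroot path
	)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
}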
package daemon

import (
	"context"
	"errors"
	"fmt"
	"os"
	"strings"

	"banger/internal/firecracker"
	"banger/internal/imagepull"
	"banger/internal/model"
	"banger/internal/roothelper"
	"banger/internal/system"
)

// jailerOpts returns the jailer launch options to bundle in the firecracker
// launch request, or nil when the jailer is disabled or misconfigured.
// nil makes the launch fall back to the legacy direct-firecracker path.
func (s *VMService) jailerOpts() *roothelper.JailerLaunchOpts {
	if !s.config.JailerEnabled {
		return nil
	}
	if strings.TrimSpace(s.config.JailerBin) == "" || strings.TrimSpace(s.config.JailerChrootBase) == "" {
		return nil
	}
	return &roothelper.JailerLaunchOpts{
		Binary:        s.config.JailerBin,
		ChrootBaseDir: s.config.JailerChrootBase,
		UID:           os.Getuid(),
		GID:           os.Getgid(),
	}
}

// buildKernelArgs assembles the kernel command line for a start.
// Direct-boot images (no initrd) get kernel-level IP config so the
// network is up before init, plus init= pointing at the universal
// first-boot wrapper. Anything else uses the plain variant.
func buildKernelArgs(vm model.VMRecord, image model.Image, bridgeIP, defaultDNS string) string {
	if strings.TrimSpace(image.InitrdPath) == "" {
		return system.BuildBootArgsWithKernelIP(
			vm.Name, vm.Runtime.GuestIP, bridgeIP, defaultDNS,
		) + " init=" + imagepull.FirstBootScriptPath
	}
	return system.BuildBootArgs(vm.Name)
}

// startContext is the mutable state threaded through every start
// step. `vm` and `live` are pointers so steps mutate in place,
// which dodges returning redundant copies and keeps step bodies
// readable. Values computed by `startVMLocked` before the driver
// runs (apiSock, dmName, tapName) live here too so each step can
// read them without rederiving.
type startContext struct {
	vm      *model.VMRecord
	image   model.Image
	live    *model.VMHandles
	apiSock string
	dmName  string
	tapName string
	fcPath  string

	// systemOverlayCreated records whether the system_overlay step
	// actually created the file (vs. the file existing from a crashed
	// prior attempt). The undo honours it so a leftover-but-valid
	// overlay isn't deleted under us.
	systemOverlayCreated bool
}

// startStep is one phase in the start-VM pipeline. Phases with no
// rollback obligation leave `undo` nil; the driver simply skips
// them on the rollback path. `createStage` / `createDetail` are
// forwarded to `vmCreateStage` so the async-create RPC caller sees
// progress; they're "" for phases that were never part of the
// user-facing progress stream.
type startStep struct {
	name         string
	attrs        []any
	createStage  string
	createDetail string
	run          func(ctx context.Context, sc *startContext) error
	undo         func(ctx context.Context, sc *startContext) error
}

// runStartSteps walks steps in order, logging each via `op.stage`
// (and `vmCreateStage` when the step opted in). On the first run
// error, it rolls back the prefix (including the failing step, so
// a step that acquired resources before erroring gets its undo
// fired) and returns the original error joined with any rollback
// error.
//
// Contract: `undo` must be safe to call even when `run` returned
// an error; check zero-value guards rather than assuming success.
// This is cheaper than a two-phase acquire/commit per step and
// matches how `cleanupPreparedCapabilities` in capabilities.go
// treats partial-success rollback.
func (s *VMService) runStartSteps(ctx context.Context, op *operationLog, sc *startContext, steps []startStep) error {
	done := make([]startStep, 0, len(steps))
	for _, step := range steps {
		if step.createStage != "" {
			vmCreateStage(ctx, step.createStage, step.createDetail)
		}
		op.stage(step.name, step.attrs...)
		if err := step.run(ctx, sc); err != nil {
			done = append(done, step) // include the failing step; see contract above
			if rollbackErr := s.rollbackStartSteps(op, sc, done); rollbackErr != nil {
				err = errors.Join(err, rollbackErr)
			}
			return err
		}
		done = append(done, step)
	}
	return nil
}

// rollbackStartSteps iterates completed steps in reverse, calling
// each non-nil `undo` with a detached context: the original ctx
// may already be cancelled (RPC client disconnect), but cleanup
// still needs to run. Undo errors are joined together; one step's
// failure doesn't short-circuit the rest.
func (s *VMService) rollbackStartSteps(op *operationLog, sc *startContext, done []startStep) error {
	var err error
	for i := len(done) - 1; i >= 0; i-- {
		step := done[i]
		if step.undo == nil {
			continue
		}
		op.stage("rollback_" + step.name)
		if undoErr := step.undo(context.Background(), sc); undoErr != nil {
			err = errors.Join(err, fmt.Errorf("rollback %s: %w", step.name, undoErr))
		}
	}
	return err
}

// buildStartSteps returns the ordered list of phases startVMLocked
// drives. Keeping the list as data (vs. a long linear method body)
// makes the phase inventory diff-readable and lets a test driver
// substitute its own step slice.
//
// Phase names MUST stay 1:1 with the prior inline version; they
// appear in daemon logs, smoke-log greps, and the async-create
// progress stream that clients read.
func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startStep {
	return []startStep{
		{
			name:         "preflight",
			createStage:  "preflight",
			createDetail: "checking host prerequisites",
			run: func(ctx context.Context, sc *startContext) error {
				if err := s.validateStartPrereqs(ctx, *sc.vm, sc.image); err != nil {
					return err
				}
				return os.MkdirAll(sc.vm.Runtime.VMDir, 0o755)
			},
		},
		{
			name: "cleanup_runtime",
			run: func(ctx context.Context, sc *startContext) error {
				if err := s.cleanupRuntime(ctx, *sc.vm, true); err != nil {
					return err
				}
				s.clearVMHandles(*sc.vm)
				return nil
			},
		},
		{
			name: "bridge",
			run: func(ctx context.Context, _ *startContext) error {
				return s.net.ensureBridge(ctx)
			},
		},
		{
			name: "socket_dir",
			run: func(_ context.Context, _ *startContext) error {
				return s.net.ensureSocketDir()
			},
		},
		{
			// prepare_sockets is a new op.stage label: the prior
			// inline code ran these `os.RemoveAll` calls before the
			// system_overlay stage without a stage marker. Keeping a
			// distinct name makes the log trace and rollback (if any
			// later step fails) unambiguous.
			name: "prepare_sockets",
			run: func(_ context.Context, sc *startContext) error {
				if err := os.RemoveAll(sc.apiSock); err != nil && !os.IsNotExist(err) {
					return err
				}
				if err := os.RemoveAll(sc.vm.Runtime.VSockPath); err != nil && !os.IsNotExist(err) {
					return err
				}
				return nil
			},
		},
		{
			name:         "system_overlay",
			attrs:        []any{"overlay_path", sc.vm.Runtime.SystemOverlay},
			createStage:  "prepare_rootfs",
			createDetail: "preparing system overlay",
			run: func(ctx context.Context, sc *startContext) error {
				// Record ownership BEFORE the call so a partial-truncate
				// failure still triggers cleanup of the half-created file.
				if !exists(sc.vm.Runtime.SystemOverlay) {
					sc.systemOverlayCreated = true
				}
				return s.ensureSystemOverlay(ctx, sc.vm)
			},
			undo: func(_ context.Context, sc *startContext) error {
				if !sc.systemOverlayCreated {
					return nil
				}
				if err := os.Remove(sc.vm.Runtime.SystemOverlay); err != nil && !os.IsNotExist(err) {
					return err
				}
				return nil
			},
		},
		{
			name:         "dm_snapshot",
			attrs:        []any{"dm_name", sc.dmName},
			createStage:  "prepare_rootfs",
			createDetail: "creating root filesystem snapshot",
			run: func(ctx context.Context, sc *startContext) error {
				snapHandles, err := s.net.createDMSnapshot(ctx, sc.image.RootfsPath, sc.vm.Runtime.SystemOverlay, sc.dmName)
				if err != nil {
					// createDMSnapshot cleans up its own partial state on
					// error; leave sc.live zero so the undo is a no-op.
					return err
				}
				sc.live.BaseLoop = snapHandles.BaseLoop
				sc.live.COWLoop = snapHandles.COWLoop
				sc.live.DMName = snapHandles.DMName
				sc.live.DMDev = snapHandles.DMDev
				s.setVMHandles(sc.vm, *sc.live)
				// Fields that used to land next to the (now-deleted)
				// cleanupOnErr closure. They belong with the DM
				// snapshot because that's the first step producing
				// runtime identity the downstream code reads back.
				sc.vm.Runtime.APISockPath = sc.apiSock
				sc.vm.Runtime.State = model.VMStateRunning
				sc.vm.State = model.VMStateRunning
				sc.vm.Runtime.LastError = ""
				return nil
			},
			undo: func(ctx context.Context, sc *startContext) error {
				if sc.live.DMName == "" && sc.live.BaseLoop == "" && sc.live.COWLoop == "" {
					return nil
				}
				return s.net.cleanupDMSnapshot(ctx, dmSnapshotHandles{
					BaseLoop: sc.live.BaseLoop,
					COWLoop:  sc.live.COWLoop,
					DMName:   sc.live.DMName,
					DMDev:    sc.live.DMDev,
				})
			},
		},
		{
			// e2fsck protects against stale bitmaps in a COW reused
			// from a prior aborted start; without it, e2cp/e2rm in
			// patch_root_overlay refuse to touch the snapshot. On a
			// freshly-created COW (system_overlay just truncated and
			// created the file this run) there are no stale bitmaps
			// to repair and e2fsck is pure overhead, so skip it in
			// that case. Exit codes 0 and 1 are both "ok" when we do
			// run it.
			name: "fsck_snapshot",
			run: func(ctx context.Context, sc *startContext) error {
				if sc.systemOverlayCreated {
					return nil
				}
				return s.privOps().FsckSnapshot(ctx, sc.live.DMDev)
			},
		},
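		// For reference, a hypothetical shape for privOps().FsckSnapshot
		// (its real implementation is not in this file), matching the
		// exit-code note above: e2fsck's documented exit code 1 means
		// "filesystem errors corrected", which still leaves a usable
		// snapshot.
		//
		//	cmd := exec.CommandContext(ctx, "e2fsck", "-p", dev)
		//	err := cmd.Run()
		//	var exitErr *exec.ExitError
		//	if errors.As(err, &exitErr) && exitErr.ExitCode() == 1 {
		//		return nil
		//	}
		//	return err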
		{
			name:         "patch_root_overlay",
			createStage:  "prepare_rootfs",
			createDetail: "writing guest configuration",
			run: func(ctx context.Context, sc *startContext) error {
				return s.patchRootOverlay(ctx, *sc.vm, sc.image, sc.live.DMDev)
			},
		},
		{
			name:         "prepare_host_features",
			createStage:  "prepare_host_features",
			createDetail: "preparing host-side vm features",
			run: func(ctx context.Context, sc *startContext) error {
				return s.capHooks.prepareHosts(ctx, sc.vm, sc.image)
			},
			// On error, prepareHosts already cleaned up the prefix that
			// succeeded before the failing capability. On success, any
			// LATER step failure triggers this undo, which tears down
			// ALL prepared caps via their Cleanup hooks.
			undo: func(ctx context.Context, sc *startContext) error {
				return s.capHooks.cleanupState(ctx, *sc.vm)
			},
		},
		{
			name: "tap",
			run: func(ctx context.Context, sc *startContext) error {
				tap, err := s.net.acquireTap(ctx, sc.tapName)
				if err != nil {
					return err
				}
				sc.live.TapDevice = tap
				s.setVMHandles(sc.vm, *sc.live)
				return nil
			},
			undo: func(ctx context.Context, sc *startContext) error {
				if sc.live.TapDevice == "" {
					return nil
				}
				return s.net.releaseTap(ctx, sc.live.TapDevice)
			},
		},
		{
			name:  "metrics_file",
			attrs: []any{"metrics_path", sc.vm.Runtime.MetricsPath},
			run: func(_ context.Context, sc *startContext) error {
				return os.WriteFile(sc.vm.Runtime.MetricsPath, nil, 0o644)
			},
			undo: func(_ context.Context, sc *startContext) error {
				if err := os.Remove(sc.vm.Runtime.MetricsPath); err != nil && !os.IsNotExist(err) {
					return err
				}
				return nil
			},
		},
		{
			name: "firecracker_binary",
			run: func(ctx context.Context, sc *startContext) error {
				fcPath, err := s.net.firecrackerBinary(ctx)
				if err != nil {
					return err
				}
				sc.fcPath = fcPath
				return nil
			},
		},
		{
			name:         "firecracker_launch",
			attrs:        []any{"log_path", sc.vm.Runtime.LogPath, "metrics_path", sc.vm.Runtime.MetricsPath},
			createStage:  "boot_firecracker",
			createDetail: "starting firecracker",
			run: func(ctx context.Context, sc *startContext) error {
				kernelArgs := buildKernelArgs(*sc.vm, sc.image, s.config.BridgeIP, s.config.DefaultDNS)
				launchReq := roothelper.FirecrackerLaunchRequest{
					BinaryPath:      sc.fcPath,
					VMID:            sc.vm.ID,
					SocketPath:      sc.apiSock,
					LogPath:         sc.vm.Runtime.LogPath,
					MetricsPath:     sc.vm.Runtime.MetricsPath,
					KernelImagePath: sc.image.KernelPath,
					InitrdPath:      sc.image.InitrdPath,
					KernelArgs:      kernelArgs,
					Drives: []firecracker.DriveConfig{{
						ID:       "rootfs",
						Path:     sc.live.DMDev,
						ReadOnly: false,
						IsRoot:   true,
					}},
					TapDevice: sc.live.TapDevice,
					VSockPath: sc.vm.Runtime.VSockPath,
					VSockCID:  sc.vm.Runtime.VSockCID,
					VCPUCount: sc.vm.Spec.VCPUCount,
					MemoryMiB: sc.vm.Spec.MemoryMiB,
					Jailer:    s.jailerOpts(),
				}
				machineConfig := firecracker.MachineConfig{Drives: launchReq.Drives}
				s.capHooks.contributeMachine(&machineConfig, *sc.vm, sc.image)
				launchReq.Drives = machineConfig.Drives
				pid, err := s.privOps().LaunchFirecracker(ctx, launchReq)
				if err != nil {
					return err
				}
				sc.live.PID = pid
				s.setVMHandles(sc.vm, *sc.live)
				op.debugStage("firecracker_started", "pid", sc.live.PID)
				return nil
			},
			undo: func(ctx context.Context, sc *startContext) error {
				var errs []error
				if sc.live.PID > 0 {
					if err := s.net.killVMProcess(ctx, sc.live.PID); err != nil {
						errs = append(errs, err)
					}
				}
				if err := os.Remove(sc.apiSock); err != nil && !os.IsNotExist(err) {
					errs = append(errs, err)
				}
				if err := os.Remove(sc.vm.Runtime.VSockPath); err != nil && !os.IsNotExist(err) {
					errs = append(errs, err)
				}
				return errors.Join(errs...)
			},
		},
		{
			name:  "socket_access",
			attrs: []any{"api_socket", sc.apiSock},
			run: func(ctx context.Context, sc *startContext) error {
				return s.net.ensureSocketAccess(ctx, sc.apiSock, "firecracker api socket")
			},
		},
		{
			name:  "vsock_access",
			attrs: []any{"vsock_path", sc.vm.Runtime.VSockPath, "vsock_cid", sc.vm.Runtime.VSockCID},
			run: func(ctx context.Context, sc *startContext) error {
				return s.net.ensureSocketAccess(ctx, sc.vm.Runtime.VSockPath, "firecracker vsock socket")
			},
		},
		{
			name:         "wait_vsock_agent",
			createStage:  "wait_vsock_agent",
			createDetail: "waiting for guest vsock agent",
			run: func(ctx context.Context, sc *startContext) error {
				return s.net.waitForGuestVSockAgent(ctx, sc.vm.Runtime.VSockPath, vsockReadyWait)
			},
		},
		{
			name:         "post_start_features",
			createStage:  "wait_guest_ready",
			createDetail: "waiting for guest services",
			run: func(ctx context.Context, sc *startContext) error {
				return s.capHooks.postStart(ctx, *sc.vm, sc.image)
			},
			// Capability Cleanup hooks are designed to be idempotent
			// (check the feature-enabled flag, no-op if nothing to undo),
			// so calling cleanupState here is safe whether postStart
			// reached every cap or bailed midway.
			undo: func(ctx context.Context, sc *startContext) error {
				return s.capHooks.cleanupState(ctx, *sc.vm)
			},
		},
		{
			name:         "persist",
			createStage:  "finalize",
			createDetail: "saving vm state",
			run: func(ctx context.Context, sc *startContext) error {
				system.TouchNow(sc.vm)
				return s.store.UpsertVM(ctx, *sc.vm)
			},
		},
	}
}
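
For orientation, a hedged sketch of how a caller such as startVMLocked
(referenced in the comments above but not part of this file) might drive
the pipeline; the surrounding variables (vm, image, op, apiSock, dmName,
tapName) are assumptions, while startContext, buildStartSteps, and
runStartSteps are the APIs defined above:

	// Hypothetical driver fragment; locking, failure-state persistence,
	// and the derivation of apiSock/dmName/tapName are elided.
	sc := &startContext{
		vm:      &vm,
		image:   image,
		live:    &model.VMHandles{},
		apiSock: apiSock,
		dmName:  dmName,
		tapName: tapName,
	}
	steps := s.buildStartSteps(op, sc)
	if err := s.runStartSteps(ctx, op, sc, steps); err != nil {
		return err // completed steps were already rolled back in reverse
	}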