Each VM's firecracker now runs inside a per-VM chroot dropped to the registered owner UID via firecracker-jailer. This closes the broad ambient-sudo escalation surface that survived Phase A: the helper still needs capabilities for tap/bridge/dm/loop/iptables work, but the VMM itself no longer runs as root in the host root filesystem.

The host helper stages each chroot up front: hard-links the kernel and (optional) initrd, mknods the block-device drives plus /dev/vhost-vsock, copies in the firecracker binary (the jailer opens it O_RDWR, so a read-only bind would fail with EROFS), and bind-mounts the /usr/lib and /lib trees read-only so the dynamic linker can resolve. It self-binds the chroot first so the findmnt-guarded cleanup can recurse safely.

AF_UNIX sun_path is 108 bytes, and the chroot socket path easily blows past that. Daemon-side launch therefore pre-symlinks the short request socket path to the long chroot socket path before Machine.Start, so the SDK's poll/connect sees the short path while the kernel resolves it to the chroot socket.

--new-pid-ns is intentionally disabled: the jailer's PID-namespace fork makes the SDK see the parent exit and tear the API socket down too early.

CapabilityBoundingSet for the helper expands to add CAP_FOWNER, CAP_KILL, CAP_MKNOD, CAP_SETGID, CAP_SETUID, and CAP_SYS_CHROOT alongside the existing CAP_CHOWN/CAP_DAC_OVERRIDE/CAP_NET_ADMIN/CAP_NET_RAW/CAP_SYS_ADMIN.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
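The sun_path symlink dance is the least obvious piece, so here is a minimal, self-contained sketch of the idea. The paths, the linkShortSocket helper, and the poll loop are illustrative assumptions, not this repo's actual API:

// sunpath_sketch.go: the short-path symlink trick (illustrative only).
package main

import (
	"net"
	"os"
	"time"
)

// linkShortSocket points a short path at the chroot-internal API socket.
// The 108-byte sun_path limit applies to the path handed to connect(2);
// symlinks are resolved during lookup, so the long chroot path never
// has to fit in sockaddr_un.
func linkShortSocket(shortPath, chrootSocketPath string) error {
	// Drop a stale link from a previous run; "not exist" is fine.
	if err := os.Remove(shortPath); err != nil && !os.IsNotExist(err) {
		return err
	}
	return os.Symlink(chrootSocketPath, shortPath)
}

func main() {
	short := "/run/vm/api.sock" // comfortably under 108 bytes
	long := "/srv/jail/firecracker/vm-abc123/root/run/firecracker.socket"
	if err := linkShortSocket(short, long); err != nil {
		panic(err)
	}
	// Poll/connect on the short path, as an SDK would, until the VMM
	// binds the socket inside its chroot.
	deadline := time.Now().Add(5 * time.Second)
	for time.Now().Before(deadline) {
		if c, err := net.Dial("unix", short); err == nil {
			c.Close()
			return
		}
		time.Sleep(200 * time.Millisecond)
	}
	panic("API socket never came up")
}

Dialing the short path works because the 108-byte check applies to the sun_path the caller passes in; the kernel then resolves the symlink during lookup, so the bound socket's long chroot path never needs to fit.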
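Similarly, a hedged sketch of the staging order the message describes (self-bind, hard-link, mknod, copy, read-only binds). Every path, mode, and device number below is an assumption for illustration, and the binary-copy helper is elided:

// stage_sketch.go: chroot staging order (illustrative only).
package main

import (
	"os"
	"path/filepath"

	"golang.org/x/sys/unix"
)

func stageChroot(chrootRoot, kernelSrc string) error {
	for _, d := range []string{chrootRoot, filepath.Join(chrootRoot, "dev"), filepath.Join(chrootRoot, "usr/lib")} {
		if err := os.MkdirAll(d, 0o755); err != nil {
			return err
		}
	}
	// Self-bind the chroot onto itself first so cleanup later has a
	// mount entry that a findmnt-guarded recursive unmount can walk.
	if err := unix.Mount(chrootRoot, chrootRoot, "", unix.MS_BIND, ""); err != nil {
		return err
	}
	// Hard-link the kernel in (same filesystem assumed).
	if err := os.Link(kernelSrc, filepath.Join(chrootRoot, "vmlinux")); err != nil {
		return err
	}
	// mknod /dev/vhost-vsock; 10:241 is its misc char-device number.
	dev := int(unix.Mkdev(10, 241))
	if err := unix.Mknod(filepath.Join(chrootRoot, "dev/vhost-vsock"), unix.S_IFCHR|0o600, dev); err != nil {
		return err
	}
	// The firecracker binary must be a real copy, not a ro bind: the
	// jailer opens it O_RDWR, which fails with EROFS on a ro mount.
	// (Copy helper elided.)

	// A read-only bind of /usr/lib takes two calls: MS_RDONLY on the
	// initial bind is ignored and only sticks on a remount pass.
	dst := filepath.Join(chrootRoot, "usr/lib")
	if err := unix.Mount("/usr/lib", dst, "", unix.MS_BIND, ""); err != nil {
		return err
	}
	return unix.Mount("", dst, "", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY, "")
}

func main() {
	if err := stageChroot("/srv/jail/firecracker/vm-abc123/root", "/var/lib/vm/vmlinux"); err != nil {
		panic(err)
	}
}

The bind-then-remount pair is the standard way to get a read-only bind mount on Linux; a single mount(2) call with MS_BIND|MS_RDONLY silently ignores the read-only flag.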
package daemon

import (
	"context"
	"errors"
	"fmt"
	"os"
	"strconv"
	"strings"
	"time"

	"banger/internal/daemon/fcproc"
	"banger/internal/firecracker"
	"banger/internal/model"
	"banger/internal/namegen"
	"banger/internal/system"
	"banger/internal/vmdns"
)
// Cross-service constants. Kept in vm.go because both lifecycle
// (VMService) and networking (HostNetwork) reference them; moving
// them to either owner would read as a layering violation.
var (
	errWaitForExitTimeout = fcproc.ErrWaitForExitTimeout
	gracefulShutdownWait  = 10 * time.Second
	vsockReadyWait        = 30 * time.Second
	vsockReadyPoll        = 200 * time.Millisecond
)
// rebuildDNS enumerates live VMs and republishes the DNS record set.
// Lives on VMService because "alive" is a VM-state concern that
// HostNetwork shouldn't need to reach into. VMService orchestrates:
// VM list from the store, alive filter, hand the resulting map to
// HostNetwork.replaceDNS.
func (s *VMService) rebuildDNS(ctx context.Context) error {
	if s.net == nil {
		return nil
	}
	vms, err := s.store.ListVMs(ctx)
	if err != nil {
		return err
	}
	records := make(map[string]string)
	for _, vm := range vms {
		if !s.vmAlive(vm) {
			continue
		}
		if strings.TrimSpace(vm.Runtime.GuestIP) == "" {
			continue
		}
		records[vmdns.RecordName(vm.Name)] = vm.Runtime.GuestIP
	}
	return s.net.replaceDNS(records)
}
// persistRuntimeTeardownState mirrors the live handle set into
// VMRuntime so restart-time cleanup can recover it when the handle
// cache is unavailable.
func persistRuntimeTeardownState(vm *model.VMRecord, h model.VMHandles) {
	if vm == nil {
		return
	}
	vm.Runtime.TapDevice = h.TapDevice
	vm.Runtime.BaseLoop = h.BaseLoop
	vm.Runtime.COWLoop = h.COWLoop
	vm.Runtime.DMName = h.DMName
	vm.Runtime.DMDev = h.DMDev
}
// clearRuntimeTeardownState blanks the VMRuntime mirrors once the
// host-side objects they describe are gone.
func clearRuntimeTeardownState(vm *model.VMRecord) {
	if vm == nil {
		return
	}
	vm.Runtime.TapDevice = ""
	vm.Runtime.BaseLoop = ""
	vm.Runtime.COWLoop = ""
	vm.Runtime.DMName = ""
	vm.Runtime.DMDev = ""
}
// teardownHandlesForCleanup merges the live handle cache with the
// VMRuntime mirrors: live values win, and any blank field is filled
// from the persisted record.
func teardownHandlesForCleanup(vm model.VMRecord, live model.VMHandles) model.VMHandles {
	recovered := live
	if strings.TrimSpace(recovered.TapDevice) == "" {
		recovered.TapDevice = strings.TrimSpace(vm.Runtime.TapDevice)
	}
	if strings.TrimSpace(recovered.BaseLoop) == "" {
		recovered.BaseLoop = strings.TrimSpace(vm.Runtime.BaseLoop)
	}
	if strings.TrimSpace(recovered.COWLoop) == "" {
		recovered.COWLoop = strings.TrimSpace(vm.Runtime.COWLoop)
	}
	if strings.TrimSpace(recovered.DMName) == "" {
		recovered.DMName = strings.TrimSpace(vm.Runtime.DMName)
	}
	if strings.TrimSpace(recovered.DMDev) == "" {
		recovered.DMDev = strings.TrimSpace(vm.Runtime.DMDev)
	}
	return recovered
}
// cleanupRuntime tears down the host-side state for a VM: firecracker
// process, DM snapshot, capabilities, tap, sockets. Lives on VMService
// because it reaches into handles (VMService-owned); the capability
// teardown goes through the capHooks seam to keep Daemon out of the
// dependency chain.
//
// Idempotency contract: every step runs even when an earlier step
// fails, and the per-step errors are joined into the returned value.
// A waitForExit timeout (firecracker refused to die) used to
// early-return, leaving DM/feature/tap state stranded on the host
// across daemon restarts. With collect-and-continue the kernel
// teardowns still attempt; in the worst case (firecracker actually
// still alive) they fail with EBUSY, which is also surfaced via
// errors.Join: no damage, but the operator sees the full picture.
func (s *VMService) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserveDisks bool) error {
	if s.logger != nil {
		s.logger.Debug("cleanup runtime", append(vmLogAttrs(vm), "preserve_disks", preserveDisks)...)
	}
	h := s.vmHandles(vm.ID)
	cleanupPID := h.PID
	if vm.Runtime.APISockPath != "" {
		if pid, err := s.net.findFirecrackerPID(ctx, vm.Runtime.APISockPath); err == nil && pid > 0 {
			cleanupPID = pid
		}
	}
	var waitErr error
	if cleanupPID > 0 && system.ProcessRunning(cleanupPID, vm.Runtime.APISockPath) {
		_ = s.net.killVMProcess(ctx, cleanupPID)
		waitErr = s.net.waitForExit(ctx, cleanupPID, vm.Runtime.APISockPath, 30*time.Second)
		if waitErr != nil && s.logger != nil {
			s.logger.Warn("cleanup wait_for_exit failed; continuing teardown", append(vmLogAttrs(vm), "pid", cleanupPID, "error", waitErr.Error())...)
		}
	}
	handles := teardownHandlesForCleanup(vm, h)
	snapshotErr := s.net.cleanupDMSnapshot(ctx, dmSnapshotHandles{
		BaseLoop: handles.BaseLoop,
		COWLoop:  handles.COWLoop,
		DMName:   handles.DMName,
		DMDev:    handles.DMDev,
	})
	featureErr := s.capHooks.cleanupState(ctx, vm)
	var tapErr error
	// Prefer the handle cache (fresh from startVMLocked), but fall
	// back to the VMRuntime mirrors so restart-time cleanup still works
	// when handles.json is missing or corrupt.
	tap := handles.TapDevice
	if tap != "" {
		tapErr = s.net.releaseTap(ctx, tap)
	}
	if vm.Runtime.APISockPath != "" {
		_ = os.Remove(vm.Runtime.APISockPath)
	}
	if vm.Runtime.VSockPath != "" {
		_ = os.Remove(vm.Runtime.VSockPath)
	}
	// Remove the jailer chroot tree (kernel hard-links, mknod'd device
	// nodes, the chroot root itself). Skipped silently when the jailer
	// is disabled or the chroot was never created. We intentionally
	// don't gate on JailerEnabled today; old VMs created before the
	// flag flipped on still need their chroots removed if any exist.
	jailerErr := s.cleanupJailerChroot(ctx, vm)
	// The handles are only meaningful while the kernel objects exist;
	// dropping them here keeps the cache in sync with reality even
	// when the caller forgets to call clearVMHandles explicitly.
	s.clearVMHandles(vm)
	if !preserveDisks && vm.Runtime.VMDir != "" {
		return errors.Join(waitErr, snapshotErr, featureErr, tapErr, jailerErr, os.RemoveAll(vm.Runtime.VMDir))
	}
	return errors.Join(waitErr, snapshotErr, featureErr, tapErr, jailerErr)
}
// cleanupJailerChroot removes the per-VM chroot tree if it exists. Returns
// nil silently when the jailer was never enabled or the chroot path can't
// be computed (no JailerChrootBase configured).
func (s *VMService) cleanupJailerChroot(ctx context.Context, vm model.VMRecord) error {
	base := strings.TrimSpace(s.config.JailerChrootBase)
	if base == "" {
		return nil
	}
	chrootRoot := firecracker.JailerChrootRoot(base, vm.ID)
	if _, err := os.Stat(chrootRoot); os.IsNotExist(err) {
		return nil
	}
	return s.privOps().CleanupJailerChroot(ctx, chrootRoot)
}
// generateName returns a random VM name, falling back to a
// timestamp-derived one when the generator yields nothing.
func (s *VMService) generateName(ctx context.Context) (string, error) {
	_ = ctx
	if name := strings.TrimSpace(namegen.Generate()); name != "" {
		return name, nil
	}
	return "vm-" + strconv.FormatInt(time.Now().Unix(), 10), nil
}
// bridgePrefix returns the first three octets of a dotted-quad bridge
// IP ("10.42.0.1" -> "10.42.0"); shorter inputs pass through unchanged.
func bridgePrefix(bridgeIP string) string {
	parts := strings.Split(bridgeIP, ".")
	if len(parts) < 3 {
		return bridgeIP
	}
	return strings.Join(parts[:3], ".")
}
// optionalIntOrDefault dereferences an optional setting, substituting
// fallback when it is unset.
func optionalIntOrDefault(value *int, fallback int) int {
	if value != nil {
		return *value
	}
	return fallback
}
// validateOptionalPositiveSetting rejects a set-but-non-positive
// optional integer setting; nil means "unset" and is accepted.
func validateOptionalPositiveSetting(label string, value *int) error {
	if value == nil {
		return nil
	}
	if *value <= 0 {
		return fmt.Errorf("%s must be a positive integer", label)
	}
	return nil
}