banger/internal/daemon/vm.go
Thales Maciel 6b543cb17f
firecracker: adopt firecracker-jailer for VM launch (Phase B)
Each VM's firecracker now runs inside a per-VM chroot dropped to the
registered owner UID via firecracker-jailer. Closes the broad ambient-
sudo escalation surface that survived Phase A: the helper still needs
caps for tap/bridge/dm/loop/iptables, but the VMM itself no longer
runs as root in the host root filesystem.

The host helper stages each chroot up front: hard-links the kernel
and (optional) initrd, mknods block-device drives + /dev/vhost-vsock,
copies in the firecracker binary (jailer opens it O_RDWR so a ro bind
fails with EROFS), and bind-mounts /usr/lib + /lib trees read-only so
the dynamic linker can resolve. Self-binds the chroot first so the
findmnt-guarded cleanup can recurse safely.

AF_UNIX sun_path is 108 bytes; the chroot path easily blows past that.
Daemon-side launch pre-symlinks the short request socket path to the
long chroot socket before Machine.Start so the SDK's poll/connect
sees the short path while the kernel resolves to the chroot socket.
--new-pid-ns is intentionally disabled — jailer's PID-namespace fork
makes the SDK see the parent exit and tear the API socket down too
early.

CapabilityBoundingSet for the helper expands to add CAP_FOWNER,
CAP_KILL, CAP_MKNOD, CAP_SETGID, CAP_SETUID, CAP_SYS_CHROOT alongside
the existing CAP_CHOWN/CAP_DAC_OVERRIDE/CAP_NET_ADMIN/CAP_NET_RAW/
CAP_SYS_ADMIN.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 14:38:07 -03:00

215 lines
6.9 KiB
Go

package daemon
import (
"context"
"errors"
"fmt"
"os"
"strconv"
"strings"
"time"
"banger/internal/daemon/fcproc"
"banger/internal/firecracker"
"banger/internal/model"
"banger/internal/namegen"
"banger/internal/system"
"banger/internal/vmdns"
)
// Cross-service constants. Kept in vm.go because both lifecycle
// (VMService) and networking (HostNetwork) reference them; moving
// them to either owner would read as a layering violation.
var (
	// errWaitForExitTimeout re-exports the fcproc sentinel so code in
	// this package can match it without importing fcproc directly.
	errWaitForExitTimeout = fcproc.ErrWaitForExitTimeout
	// gracefulShutdownWait bounds a polite guest shutdown attempt.
	// NOTE(review): the consumer is not visible in this chunk — confirm
	// against the VMService stop path.
	gracefulShutdownWait = 10 * time.Second
	// vsockReadyWait / vsockReadyPoll: presumably the total budget and
	// poll interval while waiting for the guest vsock endpoint to come
	// up after boot — verify against the launch path.
	vsockReadyWait = 30 * time.Second
	vsockReadyPoll = 200 * time.Millisecond
)
// rebuildDNS enumerates live VMs and republishes the DNS record set.
// Lives on VMService because "alive" is a VM-state concern that
// HostNetwork shouldn't need to reach into: VMService pulls the VM
// list from the store, filters to alive entries with a usable guest
// IP, and hands the resulting map to HostNetwork.replaceDNS.
func (s *VMService) rebuildDNS(ctx context.Context) error {
	// Networking may be disabled; nothing to publish in that case.
	if s.net == nil {
		return nil
	}
	all, err := s.store.ListVMs(ctx)
	if err != nil {
		return err
	}
	records := map[string]string{}
	for _, rec := range all {
		// Only alive VMs with a known guest IP earn a DNS record.
		if !s.vmAlive(rec) || strings.TrimSpace(rec.Runtime.GuestIP) == "" {
			continue
		}
		records[vmdns.RecordName(rec.Name)] = rec.Runtime.GuestIP
	}
	return s.net.replaceDNS(records)
}
// persistRuntimeTeardownState mirrors the teardown-relevant handle
// fields into the VM record's runtime section so later cleanup can
// still find them (e.g. after a daemon restart). No-op on nil.
func persistRuntimeTeardownState(vm *model.VMRecord, h model.VMHandles) {
	if vm == nil {
		return
	}
	rt := &vm.Runtime
	rt.TapDevice = h.TapDevice
	rt.BaseLoop = h.BaseLoop
	rt.COWLoop = h.COWLoop
	rt.DMName = h.DMName
	rt.DMDev = h.DMDev
}
// clearRuntimeTeardownState blanks the runtime mirrors written by
// persistRuntimeTeardownState. No-op on nil.
func clearRuntimeTeardownState(vm *model.VMRecord) {
	if vm == nil {
		return
	}
	rt := &vm.Runtime
	rt.TapDevice, rt.BaseLoop, rt.COWLoop = "", "", ""
	rt.DMName, rt.DMDev = "", ""
}
// fallbackHandle prefers the live handle value when it carries any
// non-whitespace content — returned untrimmed, exactly as cached — and
// otherwise falls back to the trimmed copy persisted in the VM record.
func fallbackHandle(live, persisted string) string {
	if strings.TrimSpace(live) != "" {
		return live
	}
	return strings.TrimSpace(persisted)
}

// teardownHandlesForCleanup merges the live handle cache with the
// runtime mirrors persisted on the VM record: for each teardown handle
// the live value wins when present, otherwise the persisted copy is
// used so restart-time cleanup still works when the cache is empty or
// stale.
func teardownHandlesForCleanup(vm model.VMRecord, live model.VMHandles) model.VMHandles {
	recovered := live
	recovered.TapDevice = fallbackHandle(live.TapDevice, vm.Runtime.TapDevice)
	recovered.BaseLoop = fallbackHandle(live.BaseLoop, vm.Runtime.BaseLoop)
	recovered.COWLoop = fallbackHandle(live.COWLoop, vm.Runtime.COWLoop)
	recovered.DMName = fallbackHandle(live.DMName, vm.Runtime.DMName)
	recovered.DMDev = fallbackHandle(live.DMDev, vm.Runtime.DMDev)
	return recovered
}
// cleanupRuntime tears down the host-side state for a VM: firecracker
// process, DM snapshot, capabilities, tap, sockets. Lives on VMService
// because it reaches into handles (VMService-owned); the capability
// teardown goes through the capHooks seam to keep Daemon out of the
// dependency chain.
//
// Idempotency contract: every step runs even when an earlier step
// fails, and the per-step errors are joined into the returned value.
// A waitForExit timeout (firecracker refused to die) used to early-
// return, leaving DM/feature/tap state stranded on the host across
// daemon restarts. With collect-and-continue the kernel teardowns
// still attempt; in the worst case (firecracker actually still alive)
// they fail with EBUSY which is also surfaced via errors.Join — no
// damage, but the operator sees the full picture.
func (s *VMService) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserveDisks bool) error {
	if s.logger != nil {
		s.logger.Debug("cleanup runtime", append(vmLogAttrs(vm), "preserve_disks", preserveDisks)...)
	}
	// The cached handle PID can be stale; prefer a PID resolved from
	// the live API socket when one can be found.
	h := s.vmHandles(vm.ID)
	cleanupPID := h.PID
	if vm.Runtime.APISockPath != "" {
		if pid, err := s.net.findFirecrackerPID(ctx, vm.Runtime.APISockPath); err == nil && pid > 0 {
			cleanupPID = pid
		}
	}
	// Kill the VMM first so the kernel objects below are (ideally) no
	// longer busy. The kill error is deliberately dropped — waitForExit
	// is the authoritative signal, and its failure is logged and joined.
	var waitErr error
	if cleanupPID > 0 && system.ProcessRunning(cleanupPID, vm.Runtime.APISockPath) {
		_ = s.net.killVMProcess(ctx, cleanupPID)
		waitErr = s.net.waitForExit(ctx, cleanupPID, vm.Runtime.APISockPath, 30*time.Second)
		if waitErr != nil && s.logger != nil {
			s.logger.Warn("cleanup wait_for_exit failed; continuing teardown", append(vmLogAttrs(vm), "pid", cleanupPID, "error", waitErr.Error())...)
		}
	}
	// Merge the live handle cache with the persisted runtime mirrors so
	// restart-time cleanup still sees the device names.
	handles := teardownHandlesForCleanup(vm, h)
	snapshotErr := s.net.cleanupDMSnapshot(ctx, dmSnapshotHandles{
		BaseLoop: handles.BaseLoop,
		COWLoop:  handles.COWLoop,
		DMName:   handles.DMName,
		DMDev:    handles.DMDev,
	})
	featureErr := s.capHooks.cleanupState(ctx, vm)
	var tapErr error
	// Prefer the handle cache (fresh from startVMLocked), but fall
	// back to the VMRuntime mirrors so restart-time cleanup still works
	// when handles.json is missing or corrupt.
	tap := handles.TapDevice
	if tap != "" {
		tapErr = s.net.releaseTap(ctx, tap)
	}
	// Socket removal is best-effort: a missing file is fine either way.
	if vm.Runtime.APISockPath != "" {
		_ = os.Remove(vm.Runtime.APISockPath)
	}
	if vm.Runtime.VSockPath != "" {
		_ = os.Remove(vm.Runtime.VSockPath)
	}
	// Remove the jailer chroot tree (kernel hard-links, mknod'd device
	// nodes, the chroot root itself). Skipped silently when the jailer
	// is disabled or the chroot was never created. We intentionally
	// don't gate on JailerEnabled today — old VMs created before the
	// flag flipped on still need their chroots removed if any exist.
	jailerErr := s.cleanupJailerChroot(ctx, vm)
	// The handles are only meaningful while the kernel objects exist;
	// dropping them here keeps the cache in sync with reality even
	// when the caller forgets to call clearVMHandles explicitly.
	s.clearVMHandles(vm)
	if !preserveDisks && vm.Runtime.VMDir != "" {
		return errors.Join(waitErr, snapshotErr, featureErr, tapErr, jailerErr, os.RemoveAll(vm.Runtime.VMDir))
	}
	return errors.Join(waitErr, snapshotErr, featureErr, tapErr, jailerErr)
}
// cleanupJailerChroot removes the per-VM chroot tree if it exists.
// Returns nil silently when the jailer was never enabled or the chroot
// path can't be computed (no JailerChrootBase configured).
func (s *VMService) cleanupJailerChroot(ctx context.Context, vm model.VMRecord) error {
	base := strings.TrimSpace(s.config.JailerChrootBase)
	if base == "" {
		return nil
	}
	root := firecracker.JailerChrootRoot(base, vm.ID)
	// Nothing staged for this VM — treat as already clean.
	if _, statErr := os.Stat(root); os.IsNotExist(statErr) {
		return nil
	}
	return s.privOps().CleanupJailerChroot(ctx, root)
}
// generateName produces a VM name: the namegen output when it yields a
// non-blank name, otherwise a unix-timestamp fallback ("vm-<secs>") so
// the call never fails. ctx is accepted for signature symmetry only.
func (s *VMService) generateName(ctx context.Context) (string, error) {
	_ = ctx // name generation is purely local today
	candidate := strings.TrimSpace(namegen.Generate())
	if candidate == "" {
		candidate = "vm-" + strconv.FormatInt(time.Now().Unix(), 10)
	}
	return candidate, nil
}
// bridgePrefix returns the first three dot-separated segments of
// bridgeIP (e.g. "10.0.0.1" -> "10.0.0"); inputs with fewer than three
// segments come back unchanged.
func bridgePrefix(bridgeIP string) string {
	// Cap at 4 pieces: only the first three are kept, so any remainder
	// past the third dot can stay unsplit.
	segments := strings.SplitN(bridgeIP, ".", 4)
	if len(segments) < 3 {
		return bridgeIP
	}
	return strings.Join(segments[:3], ".")
}
// optionalIntOrDefault dereferences value when set, and otherwise
// returns fallback.
func optionalIntOrDefault(value *int, fallback int) int {
	if value == nil {
		return fallback
	}
	return *value
}
// validateOptionalPositiveSetting accepts an unset (nil) value and any
// strictly positive value; zero or negative yields an error naming the
// setting via label.
func validateOptionalPositiveSetting(label string, value *int) error {
	switch {
	case value == nil:
		return nil
	case *value <= 0:
		return fmt.Errorf("%s must be a positive integer", label)
	default:
		return nil
	}
}