firecracker: adopt firecracker-jailer for VM launch (Phase B)
Each VM's firecracker now runs inside a per-VM chroot dropped to the registered owner UID via firecracker-jailer. Closes the broad ambient-sudo escalation surface that survived Phase A: the helper still needs caps for tap/bridge/dm/loop/iptables, but the VMM itself no longer runs as root in the host root filesystem. The host helper stages each chroot up front: hard-links the kernel and (optional) initrd, mknods block-device drives + /dev/vhost-vsock, copies in the firecracker binary (jailer opens it O_RDWR so a ro bind fails with EROFS), and bind-mounts /usr/lib + /lib trees read-only so the dynamic linker can resolve. Self-binds the chroot first so the findmnt-guarded cleanup can recurse safely. AF_UNIX sun_path is 108 bytes; the chroot path easily blows past that. Daemon-side launch pre-symlinks the short request socket path to the long chroot socket before Machine.Start so the SDK's poll/connect sees the short path while the kernel resolves to the chroot socket. --new-pid-ns is intentionally disabled — jailer's PID-namespace fork makes the SDK see the parent exit and tear the API socket down too early. CapabilityBoundingSet for the helper expands to add CAP_FOWNER, CAP_KILL, CAP_MKNOD, CAP_SETGID, CAP_SETUID, CAP_SYS_CHROOT alongside the existing CAP_CHOWN/CAP_DAC_OVERRIDE/CAP_NET_ADMIN/CAP_NET_RAW/CAP_SYS_ADMIN. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d73efe6fbc
commit
6b543cb17f
12 changed files with 864 additions and 56 deletions
|
|
@ -10,6 +10,7 @@ import (
|
|||
"time"
|
||||
|
||||
"banger/internal/daemon/fcproc"
|
||||
"banger/internal/firecracker"
|
||||
"banger/internal/model"
|
||||
"banger/internal/namegen"
|
||||
"banger/internal/system"
|
||||
|
|
@ -149,14 +150,35 @@ func (s *VMService) cleanupRuntime(ctx context.Context, vm model.VMRecord, prese
|
|||
if vm.Runtime.VSockPath != "" {
|
||||
_ = os.Remove(vm.Runtime.VSockPath)
|
||||
}
|
||||
// Remove the jailer chroot tree (kernel hard-links, mknod'd device
|
||||
// nodes, the chroot root itself). Skipped silently when the jailer
|
||||
// is disabled or the chroot was never created. We intentionally
|
||||
// don't gate on JailerEnabled today — old VMs created before the
|
||||
// flag flipped on still need their chroots removed if any exist.
|
||||
jailerErr := s.cleanupJailerChroot(ctx, vm)
|
||||
// The handles are only meaningful while the kernel objects exist;
|
||||
// dropping them here keeps the cache in sync with reality even
|
||||
// when the caller forgets to call clearVMHandles explicitly.
|
||||
s.clearVMHandles(vm)
|
||||
if !preserveDisks && vm.Runtime.VMDir != "" {
|
||||
return errors.Join(waitErr, snapshotErr, featureErr, tapErr, os.RemoveAll(vm.Runtime.VMDir))
|
||||
return errors.Join(waitErr, snapshotErr, featureErr, tapErr, jailerErr, os.RemoveAll(vm.Runtime.VMDir))
|
||||
}
|
||||
return errors.Join(waitErr, snapshotErr, featureErr, tapErr)
|
||||
return errors.Join(waitErr, snapshotErr, featureErr, tapErr, jailerErr)
|
||||
}
|
||||
|
||||
// cleanupJailerChroot removes the per-VM chroot tree if it exists. Returns
|
||||
// nil silently when the jailer was never enabled or the chroot path can't
|
||||
// be computed (no JailerChrootBase configured).
|
||||
func (s *VMService) cleanupJailerChroot(ctx context.Context, vm model.VMRecord) error {
|
||||
base := strings.TrimSpace(s.config.JailerChrootBase)
|
||||
if base == "" {
|
||||
return nil
|
||||
}
|
||||
chrootRoot := firecracker.JailerChrootRoot(base, vm.ID)
|
||||
if _, err := os.Stat(chrootRoot); os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return s.privOps().CleanupJailerChroot(ctx, chrootRoot)
|
||||
}
|
||||
|
||||
func (s *VMService) generateName(ctx context.Context) (string, error) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue