firecracker: adopt firecracker-jailer for VM launch (Phase B)
Each VM's firecracker now runs inside a per-VM chroot dropped to the registered owner UID via firecracker-jailer. Closes the broad ambient-sudo escalation surface that survived Phase A: the helper still needs caps for tap/bridge/dm/loop/iptables, but the VMM itself no longer runs as root in the host root filesystem. The host helper stages each chroot up front: hard-links the kernel and (optional) initrd, mknods block-device drives + /dev/vhost-vsock, copies in the firecracker binary (jailer opens it O_RDWR so a ro bind fails with EROFS), and bind-mounts /usr/lib + /lib trees read-only so the dynamic linker can resolve. Self-binds the chroot first so the findmnt-guarded cleanup can recurse safely. AF_UNIX sun_path is 108 bytes; the chroot path easily blows past that. Daemon-side launch pre-symlinks the short request socket path to the long chroot socket before Machine.Start so the SDK's poll/connect sees the short path while the kernel resolves to the chroot socket. --new-pid-ns is intentionally disabled — jailer's PID-namespace fork makes the SDK see the parent exit and tear the API socket down too early. CapabilityBoundingSet for the helper expands to add CAP_FOWNER, CAP_KILL, CAP_MKNOD, CAP_SETGID, CAP_SETUID, CAP_SYS_CHROOT alongside the existing CAP_CHOWN/CAP_DAC_OVERRIDE/CAP_NET_ADMIN/CAP_NET_RAW/CAP_SYS_ADMIN. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d73efe6fbc
commit
6b543cb17f
12 changed files with 864 additions and 56 deletions
|
|
@ -11,11 +11,15 @@ import (
|
|||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
|
||||
"banger/internal/firecracker"
|
||||
"banger/internal/system"
|
||||
)
|
||||
|
|
@ -271,6 +275,306 @@ func (m *Manager) Kill(ctx context.Context, pid int) error {
|
|||
return err
|
||||
}
|
||||
|
||||
// ChrootDriveSpec maps one host-side drive onto the name it takes inside
// the jailer chroot.
//
// A source that stats as a block device (the /dev/mapper/* case) is
// recreated in the chroot with mknod; every other source is hard-linked.
type ChrootDriveSpec struct {
	// ChrootName is the bare filename the drive is reachable as inside the
	// chroot (e.g. "rootfs").
	ChrootName string
	// HostPath is the host-side source: a regular file or a /dev/mapper/*
	// block device.
	HostPath string
}
|
||||
|
||||
// PrepareJailerChroot stages the chroot tree at chrootRoot for the jailer
|
||||
// to take over on launch. After this call:
|
||||
//
|
||||
// - chrootRoot exists, mode 0700, owned by uid:gid.
|
||||
// - chrootRoot/<kernel-name> is a hard link of kernelHostPath, owned uid:gid.
|
||||
// - chrootRoot/<initrd-name> is a hard link of initrdHostPath if set.
|
||||
// - For each drive: a hard link (regular file source) or a freshly
|
||||
// mknod'd block device with the source's major/minor (DM source).
|
||||
// - If wantVSock, /dev/vhost-vsock is mknod'd into the chroot so
|
||||
// firecracker can open it after chroot.
|
||||
//
|
||||
// All filesystem mutations go through runner.RunSudo when the caller isn't
|
||||
// root, so this works in dev (sudo) and system (root helper) modes alike.
|
||||
// Path components are validated by the caller (roothelper) — this helper
|
||||
// trusts them.
|
||||
func (m *Manager) PrepareJailerChroot(ctx context.Context, chrootRoot string, uid, gid int, firecrackerHostPath, kernelHostPath, kernelName, initrdHostPath, initrdName string, drives []ChrootDriveSpec, wantVSock bool) error {
|
||||
if strings.TrimSpace(chrootRoot) == "" {
|
||||
return fmt.Errorf("chroot root is required")
|
||||
}
|
||||
if err := m.sudo(ctx, "mkdir", "-p", chrootRoot); err != nil {
|
||||
return fmt.Errorf("create chroot root: %w", err)
|
||||
}
|
||||
if err := m.sudo(ctx, "chmod", "0700", chrootRoot); err != nil {
|
||||
return fmt.Errorf("chmod chroot root: %w", err)
|
||||
}
|
||||
if err := m.chown(ctx, chrootRoot, uid, gid); err != nil {
|
||||
return fmt.Errorf("chown chroot root: %w", err)
|
||||
}
|
||||
// The daemon (uid) needs to traverse the intermediate directories to reach
|
||||
// the sockets firecracker creates inside the chroot. The per-VM dir
|
||||
// (<base>/firecracker/<vmid>/) is chowned to uid so the daemon can reach
|
||||
// <vmid>/root/. The <base>/firecracker/ base and <base>/jail/ dirs get
|
||||
// world-execute (--x) so any UID can traverse through them without listing
|
||||
// their contents (the per-VM dirs are still protected by their own mode).
|
||||
vmDir := filepath.Dir(chrootRoot)
|
||||
if err := m.chown(ctx, vmDir, uid, gid); err != nil {
|
||||
return fmt.Errorf("chown vm dir: %w", err)
|
||||
}
|
||||
fcBaseDir := filepath.Dir(vmDir)
|
||||
if err := m.sudo(ctx, "chmod", "0711", fcBaseDir); err != nil {
|
||||
return fmt.Errorf("chmod firecracker base dir: %w", err)
|
||||
}
|
||||
jailBaseDir := filepath.Dir(fcBaseDir)
|
||||
if err := m.sudo(ctx, "chmod", "0711", jailBaseDir); err != nil {
|
||||
return fmt.Errorf("chmod jail base dir: %w", err)
|
||||
}
|
||||
// Order matters: hard-link the kernel + file-backed drives BEFORE
|
||||
// the self-bind below. link(2) refuses to cross mount points even
|
||||
// when the underlying superblock is the same — once chrootRoot is a
|
||||
// mount point, `ln /var/lib/.../kernel <chroot>/vmlinux` returns
|
||||
// EXDEV.
|
||||
if err := m.linkInto(ctx, chrootRoot, kernelHostPath, kernelName, uid, gid); err != nil {
|
||||
return fmt.Errorf("link kernel: %w", err)
|
||||
}
|
||||
if strings.TrimSpace(initrdHostPath) != "" {
|
||||
if err := m.linkInto(ctx, chrootRoot, initrdHostPath, initrdName, uid, gid); err != nil {
|
||||
return fmt.Errorf("link initrd: %w", err)
|
||||
}
|
||||
}
|
||||
for _, d := range drives {
|
||||
if err := m.stageDrive(ctx, chrootRoot, d, uid, gid); err != nil {
|
||||
return fmt.Errorf("stage drive %s: %w", d.ChrootName, err)
|
||||
}
|
||||
}
|
||||
if wantVSock {
|
||||
// The jailer creates /dev inside the chroot, but /dev/vhost-vsock must
|
||||
// be pre-staged so firecracker can open it after the jailer chroots.
|
||||
devDir := chrootRoot + "/dev"
|
||||
if err := m.sudo(ctx, "mkdir", "-p", devDir); err != nil {
|
||||
return fmt.Errorf("create chroot/dev: %w", err)
|
||||
}
|
||||
if err := m.chown(ctx, devDir, uid, gid); err != nil {
|
||||
return fmt.Errorf("chown chroot/dev: %w", err)
|
||||
}
|
||||
if err := m.stageDevice(ctx, chrootRoot, "dev/vhost-vsock", "/dev/vhost-vsock", uid, gid); err != nil {
|
||||
return fmt.Errorf("stage vhost-vsock: %w", err)
|
||||
}
|
||||
}
|
||||
// Bind firecracker + the host libdirs into the chroot read-only.
|
||||
// firecracker is dynamically linked (interpreter /lib64/ld-linux-*,
|
||||
// libc, libgcc), and inside the chroot ENOENT on those is reported
|
||||
// as "Failed to exec into Firecracker: No such file or directory" —
|
||||
// the kernel's misleading ENOENT-for-missing-interpreter error.
|
||||
//
|
||||
// Done last so the link/mknod steps above don't have to cross the
|
||||
// self-bind mount boundary (link(2) returns EXDEV at mount edges).
|
||||
// Self-bind first so CleanupJailerChroot's `umount -lR` can recurse
|
||||
// from chrootRoot itself; --make-private blocks propagation back to
|
||||
// the host mount namespace.
|
||||
// firecracker is copied (not bind-mounted) because jailer opens the
|
||||
// binary O_RDWR — apparently to seal it or rewrite something — and
|
||||
// fails with EROFS on a ro-bind.
|
||||
chrootFC := chrootRoot + "/" + filepath.Base(firecrackerHostPath)
|
||||
if err := m.sudo(ctx, "cp", "-f", firecrackerHostPath, chrootFC); err != nil {
|
||||
return fmt.Errorf("copy firecracker into chroot: %w", err)
|
||||
}
|
||||
if err := m.sudo(ctx, "chmod", "0755", chrootFC); err != nil {
|
||||
return fmt.Errorf("chmod firecracker in chroot: %w", err)
|
||||
}
|
||||
if err := m.chown(ctx, chrootFC, uid, gid); err != nil {
|
||||
return fmt.Errorf("chown firecracker in chroot: %w", err)
|
||||
}
|
||||
if err := m.sudo(ctx, "mount", "--bind", chrootRoot, chrootRoot); err != nil {
|
||||
return fmt.Errorf("self-bind chroot: %w", err)
|
||||
}
|
||||
// Remount without nosuid: the helper unit's ReadWritePaths binding marks
|
||||
// /var/lib/banger nosuid, and bind mounts inherit that flag. The jailer
|
||||
// needs to exec /firecracker as UID 1000, which the kernel denies on a
|
||||
// nosuid mount when NoNewPrivileges is set on the unit.
|
||||
if err := m.sudo(ctx, "mount", "-o", "remount,bind,suid", chrootRoot, chrootRoot); err != nil {
|
||||
return fmt.Errorf("remount chroot suid: %w", err)
|
||||
}
|
||||
if err := m.sudo(ctx, "mount", "--make-private", chrootRoot); err != nil {
|
||||
return fmt.Errorf("make-private chroot: %w", err)
|
||||
}
|
||||
// Pre-create /usr with world-traversable permissions. UMask=0077 on the
|
||||
// helper unit causes plain mkdir to produce 0700 dirs; UID 1000 must be
|
||||
// able to traverse /usr/ to reach the dynamic linker via lib64 → usr/lib.
|
||||
if err := m.sudo(ctx, "install", "-d", "-m", "0755", chrootRoot+"/usr"); err != nil {
|
||||
return fmt.Errorf("create chroot/usr: %w", err)
|
||||
}
|
||||
// Bind real libdirs and replicate the host's compat symlinks
|
||||
// (/lib64 → /usr/lib, etc) inside the chroot so firecracker's
|
||||
// PT_INTERP path (/lib64/ld-linux-*) resolves to the bound libs.
|
||||
for _, libDir := range []string{"/usr/lib", "/usr/lib64", "/lib", "/lib64"} {
|
||||
info, err := os.Lstat(libDir)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
target := chrootRoot + libDir
|
||||
if info.Mode()&os.ModeSymlink != 0 {
|
||||
link, err := os.Readlink(libDir)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if err := m.sudo(ctx, "ln", "-sfn", link, target); err != nil {
|
||||
return fmt.Errorf("symlink %s -> %s: %w", target, link, err)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if !info.IsDir() {
|
||||
continue
|
||||
}
|
||||
if err := m.bindDir(ctx, libDir, target, true); err != nil {
|
||||
return fmt.Errorf("bind %s: %w", libDir, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// CleanupJailerChroot tears down a chroot built by PrepareJailerChroot:
|
||||
// lazy-recursive umount of every mount under (or at) chrootRoot, then a
|
||||
// findmnt-guarded `rm -rf`. The guard is load-bearing: if any bind mount
|
||||
// remained, `rm -rf` would descend into the bind source (e.g. /usr/lib)
|
||||
// and start deleting host files. The umount runs `-l` (lazy) so an in-use
|
||||
// bind point still gets detached from the namespace; the guarded check
|
||||
// then catches the rare case where detachment didn't happen.
|
||||
func (m *Manager) CleanupJailerChroot(ctx context.Context, chrootRoot string) error {
|
||||
if strings.TrimSpace(chrootRoot) == "" {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat(chrootRoot); os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
// Best-effort umount: for chroots that were never bind-mounted (a
|
||||
// stale install pre-bind-mount work, say) this fails — that's fine,
|
||||
// the findmnt guard below is what enforces safety.
|
||||
_ = m.sudoIgnore(ctx, "umount", "--recursive", "--lazy", chrootRoot)
|
||||
if mounts, err := m.mountsUnder(ctx, chrootRoot); err != nil {
|
||||
return fmt.Errorf("inspect chroot mounts: %w", err)
|
||||
} else if len(mounts) > 0 {
|
||||
return fmt.Errorf("refusing to rm -rf %q: still has %d mount(s): %v", chrootRoot, len(mounts), mounts)
|
||||
}
|
||||
return m.sudo(ctx, "rm", "-rf", "--", chrootRoot)
|
||||
}
|
||||
|
||||
func (m *Manager) sudoIgnore(ctx context.Context, name string, args ...string) error {
|
||||
err := m.sudo(ctx, name, args...)
|
||||
return err
|
||||
}
|
||||
|
||||
func (m *Manager) bindFile(ctx context.Context, source, target string, readOnly bool) error {
|
||||
if err := m.sudo(ctx, "install", "-D", "-m", "0644", "/dev/null", target); err != nil {
|
||||
return fmt.Errorf("create bind target file: %w", err)
|
||||
}
|
||||
return m.bindMount(ctx, source, target, readOnly)
|
||||
}
|
||||
|
||||
func (m *Manager) bindDir(ctx context.Context, source, target string, readOnly bool) error {
|
||||
if err := m.sudo(ctx, "mkdir", "-p", target); err != nil {
|
||||
return fmt.Errorf("create bind target dir: %w", err)
|
||||
}
|
||||
return m.bindMount(ctx, source, target, readOnly)
|
||||
}
|
||||
|
||||
func (m *Manager) bindMount(ctx context.Context, source, target string, readOnly bool) error {
|
||||
if err := m.sudo(ctx, "mount", "--bind", source, target); err != nil {
|
||||
return err
|
||||
}
|
||||
if !readOnly {
|
||||
return nil
|
||||
}
|
||||
// Single-step ro bind isn't honored by all kernels — the bind happens
|
||||
// rw and the ro flag is silently ignored. Remount makes it stick.
|
||||
return m.sudo(ctx, "mount", "-o", "remount,bind,ro", target)
|
||||
}
|
||||
|
||||
// mountsUnder returns the list of mount targets at or under chrootRoot.
|
||||
// findmnt's output is one path per line; an empty list means no leftovers.
|
||||
func (m *Manager) mountsUnder(ctx context.Context, chrootRoot string) ([]string, error) {
|
||||
out, err := m.runner.Run(ctx, "findmnt", "--output", "TARGET", "--list", "--noheadings")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var mounts []string
|
||||
prefix := chrootRoot + string(os.PathSeparator)
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
t := strings.TrimSpace(line)
|
||||
if t == chrootRoot || strings.HasPrefix(t, prefix) {
|
||||
mounts = append(mounts, t)
|
||||
}
|
||||
}
|
||||
return mounts, nil
|
||||
}
|
||||
|
||||
func (m *Manager) stageDrive(ctx context.Context, chrootRoot string, d ChrootDriveSpec, uid, gid int) error {
|
||||
info, err := os.Stat(d.HostPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if info.Mode()&os.ModeDevice != 0 {
|
||||
stat, ok := info.Sys().(*syscall.Stat_t)
|
||||
if !ok {
|
||||
return fmt.Errorf("stat %s: cannot read device numbers", d.HostPath)
|
||||
}
|
||||
major := unix.Major(stat.Rdev)
|
||||
minor := unix.Minor(stat.Rdev)
|
||||
return m.mknodBlock(ctx, chrootRoot, d.ChrootName, major, minor, uid, gid)
|
||||
}
|
||||
return m.linkInto(ctx, chrootRoot, d.HostPath, d.ChrootName, uid, gid)
|
||||
}
|
||||
|
||||
func (m *Manager) stageDevice(ctx context.Context, chrootRoot, chrootName, hostDevice string, uid, gid int) error {
|
||||
info, err := os.Stat(hostDevice)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
stat, ok := info.Sys().(*syscall.Stat_t)
|
||||
if !ok {
|
||||
return fmt.Errorf("stat %s: cannot read device numbers", hostDevice)
|
||||
}
|
||||
major := unix.Major(stat.Rdev)
|
||||
minor := unix.Minor(stat.Rdev)
|
||||
target := chrootRoot + "/" + chrootName
|
||||
if err := m.sudo(ctx, "mknod", "-m", "0660", target, "c", strconv.FormatUint(uint64(major), 10), strconv.FormatUint(uint64(minor), 10)); err != nil {
|
||||
return err
|
||||
}
|
||||
return m.chown(ctx, target, uid, gid)
|
||||
}
|
||||
|
||||
func (m *Manager) mknodBlock(ctx context.Context, chrootRoot, name string, major, minor uint32, uid, gid int) error {
|
||||
target := chrootRoot + "/" + name
|
||||
if err := m.sudo(ctx, "mknod", "-m", "0660", target, "b", strconv.FormatUint(uint64(major), 10), strconv.FormatUint(uint64(minor), 10)); err != nil {
|
||||
return err
|
||||
}
|
||||
return m.chown(ctx, target, uid, gid)
|
||||
}
|
||||
|
||||
func (m *Manager) linkInto(ctx context.Context, chrootRoot, source, name string, uid, gid int) error {
|
||||
target := chrootRoot + "/" + name
|
||||
if err := m.sudo(ctx, "ln", "-f", source, target); err != nil {
|
||||
return err
|
||||
}
|
||||
return m.chown(ctx, target, uid, gid)
|
||||
}
|
||||
|
||||
func (m *Manager) chown(ctx context.Context, target string, uid, gid int) error {
|
||||
return m.sudo(ctx, "chown", fmt.Sprintf("%d:%d", uid, gid), target)
|
||||
}
|
||||
|
||||
func (m *Manager) sudo(ctx context.Context, name string, args ...string) error {
|
||||
if os.Geteuid() == 0 {
|
||||
_, err := m.runner.Run(ctx, name, args...)
|
||||
return err
|
||||
}
|
||||
_, err := m.runner.RunSudo(ctx, append([]string{name}, args...)...)
|
||||
return err
|
||||
}
|
||||
|
||||
func waitForPath(ctx context.Context, path string, timeout time.Duration, label string) error {
|
||||
return pollPath(ctx, path, timeout, 100*time.Millisecond, label)
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue