firecracker: adopt firecracker-jailer for VM launch (Phase B)
Each VM's firecracker now runs inside a per-VM chroot dropped to the registered owner UID via firecracker-jailer. Closes the broad ambient-sudo escalation surface that survived Phase A: the helper still needs caps for tap/bridge/dm/loop/iptables, but the VMM itself no longer runs as root in the host root filesystem. The host helper stages each chroot up front: hard-links the kernel and (optional) initrd, mknods block-device drives + /dev/vhost-vsock, copies in the firecracker binary (jailer opens it O_RDWR so a ro bind fails with EROFS), and bind-mounts /usr/lib + /lib trees read-only so the dynamic linker can resolve. Self-binds the chroot first so the findmnt-guarded cleanup can recurse safely. AF_UNIX sun_path is 108 bytes; the chroot path easily blows past that. Daemon-side launch pre-symlinks the short request socket path to the long chroot socket before Machine.Start so the SDK's poll/connect sees the short path while the kernel resolves to the chroot socket. --new-pid-ns is intentionally disabled — jailer's PID-namespace fork makes the SDK see the parent exit and tear the API socket down too early. CapabilityBoundingSet for the helper expands to add CAP_FOWNER, CAP_KILL, CAP_MKNOD, CAP_SETGID, CAP_SETUID, CAP_SYS_CHROOT alongside the existing CAP_CHOWN/CAP_DAC_OVERRIDE/CAP_NET_ADMIN/CAP_NET_RAW/CAP_SYS_ADMIN. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d73efe6fbc
commit
6b543cb17f
12 changed files with 864 additions and 56 deletions
|
|
@ -367,7 +367,7 @@ func renderRootHelperSystemdUnit() string {
|
|||
"LockPersonality=yes",
|
||||
"SystemCallArchitectures=native",
|
||||
"RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 AF_NETLINK AF_VSOCK",
|
||||
"CapabilityBoundingSet=CAP_CHOWN CAP_DAC_OVERRIDE CAP_NET_ADMIN CAP_NET_RAW CAP_SYS_ADMIN",
|
||||
"CapabilityBoundingSet=CAP_CHOWN CAP_DAC_OVERRIDE CAP_FOWNER CAP_KILL CAP_MKNOD CAP_NET_ADMIN CAP_NET_RAW CAP_SETGID CAP_SETUID CAP_SYS_ADMIN CAP_SYS_CHROOT",
|
||||
"ReadWritePaths=/var/lib/banger",
|
||||
"RuntimeDirectory=banger-root",
|
||||
"RuntimeDirectoryMode=0711",
|
||||
|
|
|
|||
|
|
@ -183,7 +183,7 @@ func TestRenderRootHelperSystemdUnitIncludesRequiredCapabilities(t *testing.T) {
|
|||
"ProtectSystem=strict",
|
||||
"ProtectHome=yes",
|
||||
"RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 AF_NETLINK AF_VSOCK",
|
||||
"CapabilityBoundingSet=CAP_CHOWN CAP_DAC_OVERRIDE CAP_NET_ADMIN CAP_NET_RAW CAP_SYS_ADMIN",
|
||||
"CapabilityBoundingSet=CAP_CHOWN CAP_DAC_OVERRIDE CAP_FOWNER CAP_KILL CAP_MKNOD CAP_NET_ADMIN CAP_NET_RAW CAP_SETGID CAP_SETUID CAP_SYS_ADMIN CAP_SYS_CHROOT",
|
||||
"ReadWritePaths=/var/lib/banger",
|
||||
"RuntimeDirectory=banger-root",
|
||||
"RuntimeDirectoryMode=0711",
|
||||
|
|
|
|||
|
|
@ -22,6 +22,9 @@ import (
|
|||
type fileConfig struct {
|
||||
LogLevel string `toml:"log_level"`
|
||||
FirecrackerBin string `toml:"firecracker_bin"`
|
||||
JailerBin string `toml:"jailer_bin"`
|
||||
JailerEnabled *bool `toml:"jailer_enabled"`
|
||||
JailerChrootBase string `toml:"jailer_chroot_base"`
|
||||
SSHKeyPath string `toml:"ssh_key_path"`
|
||||
DefaultImageName string `toml:"default_image_name"`
|
||||
AutoStopStaleAfter string `toml:"auto_stop_stale_after"`
|
||||
|
|
@ -75,6 +78,13 @@ func load(layout paths.Layout, home string, ensureDefaultSSHKey bool) (model.Dae
|
|||
DefaultDNS: model.DefaultDNS,
|
||||
DefaultImageName: "debian-bookworm",
|
||||
HostHomeDir: home,
|
||||
JailerBin: model.DefaultJailerBinary,
|
||||
JailerEnabled: true,
|
||||
// Chroot lives under StateDir (ext4) — not RuntimeDir (tmpfs).
|
||||
// Hard-linking the kernel and any file-backed drives into the
|
||||
// chroot requires same-filesystem; images already live under
|
||||
// StateDir, so colocating the chroot avoids EXDEV.
|
||||
JailerChrootBase: filepath.Join(layout.StateDir, "jail"),
|
||||
}
|
||||
|
||||
var file fileConfig
|
||||
|
|
@ -99,6 +109,15 @@ func load(layout paths.Layout, home string, ensureDefaultSSHKey bool) (model.Dae
|
|||
} else if path, err := system.LookupExecutable("firecracker"); err == nil {
|
||||
cfg.FirecrackerBin = path
|
||||
}
|
||||
if value := strings.TrimSpace(file.JailerBin); value != "" {
|
||||
cfg.JailerBin = value
|
||||
}
|
||||
if file.JailerEnabled != nil {
|
||||
cfg.JailerEnabled = *file.JailerEnabled
|
||||
}
|
||||
if value := strings.TrimSpace(file.JailerChrootBase); value != "" {
|
||||
cfg.JailerChrootBase = value
|
||||
}
|
||||
if value := strings.TrimSpace(file.DefaultImageName); value != "" {
|
||||
cfg.DefaultImageName = value
|
||||
}
|
||||
|
|
|
|||
|
|
@ -9,6 +9,8 @@ import (
|
|||
"log/slog"
|
||||
"net"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
|
|
@ -88,6 +90,14 @@ func OpenSystem(ctx context.Context) (*Daemon, error) {
|
|||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// config.Load fills JailerChrootBase from the layout it sees. In
|
||||
// system mode that's the owner's layout (no privileged StateDir) so
|
||||
// the value lands under the owner home — wrong for the helper, which
|
||||
// validates paths against the system StateDir. Override it here whenever
|
||||
// it is empty, relative, or under the owner StateDir, so both daemon and helper see /var/lib/banger/jail.
|
||||
if strings.TrimSpace(cfg.JailerChrootBase) == "" || !filepath.IsAbs(cfg.JailerChrootBase) || strings.HasPrefix(cfg.JailerChrootBase, ownerLayout.StateDir) {
|
||||
cfg.JailerChrootBase = filepath.Join(layout.StateDir, "jail")
|
||||
}
|
||||
helper := newHelperPrivilegedOps(roothelper.NewClient(installmeta.DefaultRootHelperSocketPath), cfg, layout)
|
||||
return openWithConfig(ctx, layout, ownerLayout, cfg, -1, -1, false, helper)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,11 +11,15 @@ import (
|
|||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
|
||||
"banger/internal/firecracker"
|
||||
"banger/internal/system"
|
||||
)
|
||||
|
|
@ -271,6 +275,306 @@ func (m *Manager) Kill(ctx context.Context, pid int) error {
|
|||
return err
|
||||
}
|
||||
|
||||
// ChrootDriveSpec pairs a host-side drive source with the name it takes
// inside a jailer chroot. stageDrive consumes it: a HostPath that stats as
// a device gets a matching node mknod'd into the chroot, and any other
// source is hard-linked in.
type ChrootDriveSpec struct {
	// ChrootName is the bare filename the drive is reachable as inside
	// the chroot (e.g. "rootfs").
	ChrootName string
	// HostPath is the host-side source: a regular file or a
	// /dev/mapper/* block device.
	HostPath string
}
|
||||
|
||||
// PrepareJailerChroot stages the chroot tree at chrootRoot for the jailer
|
||||
// to take over on launch. After this call:
|
||||
//
|
||||
// - chrootRoot exists, mode 0700, owned by uid:gid.
|
||||
// - chrootRoot/<kernel-name> is a hard link of kernelHostPath, owned uid:gid.
|
||||
// - chrootRoot/<initrd-name> is a hard link of initrdHostPath if set.
|
||||
// - For each drive: a hard link (regular file source) or a freshly
|
||||
// mknod'd block device with the source's major/minor (DM source).
|
||||
// - If wantVSock, /dev/vhost-vsock is mknod'd into the chroot so
|
||||
// firecracker can open it after chroot.
|
||||
//
|
||||
// All filesystem mutations go through runner.RunSudo when the caller isn't
|
||||
// root, so this works in dev (sudo) and system (root helper) modes alike.
|
||||
// Path components are validated by the caller (roothelper) — this helper
|
||||
// trusts them.
|
||||
func (m *Manager) PrepareJailerChroot(ctx context.Context, chrootRoot string, uid, gid int, firecrackerHostPath, kernelHostPath, kernelName, initrdHostPath, initrdName string, drives []ChrootDriveSpec, wantVSock bool) error {
|
||||
if strings.TrimSpace(chrootRoot) == "" {
|
||||
return fmt.Errorf("chroot root is required")
|
||||
}
|
||||
if err := m.sudo(ctx, "mkdir", "-p", chrootRoot); err != nil {
|
||||
return fmt.Errorf("create chroot root: %w", err)
|
||||
}
|
||||
if err := m.sudo(ctx, "chmod", "0700", chrootRoot); err != nil {
|
||||
return fmt.Errorf("chmod chroot root: %w", err)
|
||||
}
|
||||
if err := m.chown(ctx, chrootRoot, uid, gid); err != nil {
|
||||
return fmt.Errorf("chown chroot root: %w", err)
|
||||
}
|
||||
// The daemon (uid) needs to traverse the intermediate directories to reach
|
||||
// the sockets firecracker creates inside the chroot. The per-VM dir
|
||||
// (<base>/firecracker/<vmid>/) is chowned to uid so the daemon can reach
|
||||
// <vmid>/root/. The <base>/firecracker/ base and <base>/jail/ dirs get
|
||||
// world-execute (--x) so any UID can traverse through them without listing
|
||||
// their contents (the per-VM dirs are still protected by their own mode).
|
||||
vmDir := filepath.Dir(chrootRoot)
|
||||
if err := m.chown(ctx, vmDir, uid, gid); err != nil {
|
||||
return fmt.Errorf("chown vm dir: %w", err)
|
||||
}
|
||||
fcBaseDir := filepath.Dir(vmDir)
|
||||
if err := m.sudo(ctx, "chmod", "0711", fcBaseDir); err != nil {
|
||||
return fmt.Errorf("chmod firecracker base dir: %w", err)
|
||||
}
|
||||
jailBaseDir := filepath.Dir(fcBaseDir)
|
||||
if err := m.sudo(ctx, "chmod", "0711", jailBaseDir); err != nil {
|
||||
return fmt.Errorf("chmod jail base dir: %w", err)
|
||||
}
|
||||
// Order matters: hard-link the kernel + file-backed drives BEFORE
|
||||
// the self-bind below. link(2) refuses to cross mount points even
|
||||
// when the underlying superblock is the same — once chrootRoot is a
|
||||
// mount point, `ln /var/lib/.../kernel <chroot>/vmlinux` returns
|
||||
// EXDEV.
|
||||
if err := m.linkInto(ctx, chrootRoot, kernelHostPath, kernelName, uid, gid); err != nil {
|
||||
return fmt.Errorf("link kernel: %w", err)
|
||||
}
|
||||
if strings.TrimSpace(initrdHostPath) != "" {
|
||||
if err := m.linkInto(ctx, chrootRoot, initrdHostPath, initrdName, uid, gid); err != nil {
|
||||
return fmt.Errorf("link initrd: %w", err)
|
||||
}
|
||||
}
|
||||
for _, d := range drives {
|
||||
if err := m.stageDrive(ctx, chrootRoot, d, uid, gid); err != nil {
|
||||
return fmt.Errorf("stage drive %s: %w", d.ChrootName, err)
|
||||
}
|
||||
}
|
||||
if wantVSock {
|
||||
// The jailer creates /dev inside the chroot, but /dev/vhost-vsock must
|
||||
// be pre-staged so firecracker can open it after the jailer chroots.
|
||||
devDir := chrootRoot + "/dev"
|
||||
if err := m.sudo(ctx, "mkdir", "-p", devDir); err != nil {
|
||||
return fmt.Errorf("create chroot/dev: %w", err)
|
||||
}
|
||||
if err := m.chown(ctx, devDir, uid, gid); err != nil {
|
||||
return fmt.Errorf("chown chroot/dev: %w", err)
|
||||
}
|
||||
if err := m.stageDevice(ctx, chrootRoot, "dev/vhost-vsock", "/dev/vhost-vsock", uid, gid); err != nil {
|
||||
return fmt.Errorf("stage vhost-vsock: %w", err)
|
||||
}
|
||||
}
|
||||
// Bind firecracker + the host libdirs into the chroot read-only.
|
||||
// firecracker is dynamically linked (interpreter /lib64/ld-linux-*,
|
||||
// libc, libgcc), and inside the chroot ENOENT on those is reported
|
||||
// as "Failed to exec into Firecracker: No such file or directory" —
|
||||
// the kernel's misleading ENOENT-for-missing-interpreter error.
|
||||
//
|
||||
// Done last so the link/mknod steps above don't have to cross the
|
||||
// self-bind mount boundary (link(2) returns EXDEV at mount edges).
|
||||
// Self-bind first so CleanupJailerChroot's `umount -lR` can recurse
|
||||
// from chrootRoot itself; --make-private blocks propagation back to
|
||||
// the host mount namespace.
|
||||
// firecracker is copied (not bind-mounted) because jailer opens the
|
||||
// binary O_RDWR — apparently to seal it or rewrite something — and
|
||||
// fails with EROFS on a ro-bind.
|
||||
chrootFC := chrootRoot + "/" + filepath.Base(firecrackerHostPath)
|
||||
if err := m.sudo(ctx, "cp", "-f", firecrackerHostPath, chrootFC); err != nil {
|
||||
return fmt.Errorf("copy firecracker into chroot: %w", err)
|
||||
}
|
||||
if err := m.sudo(ctx, "chmod", "0755", chrootFC); err != nil {
|
||||
return fmt.Errorf("chmod firecracker in chroot: %w", err)
|
||||
}
|
||||
if err := m.chown(ctx, chrootFC, uid, gid); err != nil {
|
||||
return fmt.Errorf("chown firecracker in chroot: %w", err)
|
||||
}
|
||||
if err := m.sudo(ctx, "mount", "--bind", chrootRoot, chrootRoot); err != nil {
|
||||
return fmt.Errorf("self-bind chroot: %w", err)
|
||||
}
|
||||
// Remount without nosuid: the helper unit's ReadWritePaths binding marks
|
||||
// /var/lib/banger nosuid, and bind mounts inherit that flag. The jailer
|
||||
// needs to exec /firecracker as UID 1000, which the kernel denies on a
|
||||
// nosuid mount when NoNewPrivileges is set on the unit.
|
||||
if err := m.sudo(ctx, "mount", "-o", "remount,bind,suid", chrootRoot, chrootRoot); err != nil {
|
||||
return fmt.Errorf("remount chroot suid: %w", err)
|
||||
}
|
||||
if err := m.sudo(ctx, "mount", "--make-private", chrootRoot); err != nil {
|
||||
return fmt.Errorf("make-private chroot: %w", err)
|
||||
}
|
||||
// Pre-create /usr with world-traversable permissions. UMask=0077 on the
|
||||
// helper unit causes plain mkdir to produce 0700 dirs; UID 1000 must be
|
||||
// able to traverse /usr/ to reach the dynamic linker via lib64 → usr/lib.
|
||||
if err := m.sudo(ctx, "install", "-d", "-m", "0755", chrootRoot+"/usr"); err != nil {
|
||||
return fmt.Errorf("create chroot/usr: %w", err)
|
||||
}
|
||||
// Bind real libdirs and replicate the host's compat symlinks
|
||||
// (/lib64 → /usr/lib, etc) inside the chroot so firecracker's
|
||||
// PT_INTERP path (/lib64/ld-linux-*) resolves to the bound libs.
|
||||
for _, libDir := range []string{"/usr/lib", "/usr/lib64", "/lib", "/lib64"} {
|
||||
info, err := os.Lstat(libDir)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
target := chrootRoot + libDir
|
||||
if info.Mode()&os.ModeSymlink != 0 {
|
||||
link, err := os.Readlink(libDir)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if err := m.sudo(ctx, "ln", "-sfn", link, target); err != nil {
|
||||
return fmt.Errorf("symlink %s -> %s: %w", target, link, err)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if !info.IsDir() {
|
||||
continue
|
||||
}
|
||||
if err := m.bindDir(ctx, libDir, target, true); err != nil {
|
||||
return fmt.Errorf("bind %s: %w", libDir, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// CleanupJailerChroot tears down a chroot built by PrepareJailerChroot:
|
||||
// lazy-recursive umount of every mount under (or at) chrootRoot, then a
|
||||
// findmnt-guarded `rm -rf`. The guard is load-bearing: if any bind mount
|
||||
// remained, `rm -rf` would descend into the bind source (e.g. /usr/lib)
|
||||
// and start deleting host files. The umount runs `-l` (lazy) so an in-use
|
||||
// bind point still gets detached from the namespace; the guarded check
|
||||
// then catches the rare case where detachment didn't happen.
|
||||
func (m *Manager) CleanupJailerChroot(ctx context.Context, chrootRoot string) error {
|
||||
if strings.TrimSpace(chrootRoot) == "" {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat(chrootRoot); os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
// Best-effort umount: for chroots that were never bind-mounted (a
|
||||
// stale install pre-bind-mount work, say) this fails — that's fine,
|
||||
// the findmnt guard below is what enforces safety.
|
||||
_ = m.sudoIgnore(ctx, "umount", "--recursive", "--lazy", chrootRoot)
|
||||
if mounts, err := m.mountsUnder(ctx, chrootRoot); err != nil {
|
||||
return fmt.Errorf("inspect chroot mounts: %w", err)
|
||||
} else if len(mounts) > 0 {
|
||||
return fmt.Errorf("refusing to rm -rf %q: still has %d mount(s): %v", chrootRoot, len(mounts), mounts)
|
||||
}
|
||||
return m.sudo(ctx, "rm", "-rf", "--", chrootRoot)
|
||||
}
|
||||
|
||||
func (m *Manager) sudoIgnore(ctx context.Context, name string, args ...string) error {
|
||||
err := m.sudo(ctx, name, args...)
|
||||
return err
|
||||
}
|
||||
|
||||
func (m *Manager) bindFile(ctx context.Context, source, target string, readOnly bool) error {
|
||||
if err := m.sudo(ctx, "install", "-D", "-m", "0644", "/dev/null", target); err != nil {
|
||||
return fmt.Errorf("create bind target file: %w", err)
|
||||
}
|
||||
return m.bindMount(ctx, source, target, readOnly)
|
||||
}
|
||||
|
||||
func (m *Manager) bindDir(ctx context.Context, source, target string, readOnly bool) error {
|
||||
if err := m.sudo(ctx, "mkdir", "-p", target); err != nil {
|
||||
return fmt.Errorf("create bind target dir: %w", err)
|
||||
}
|
||||
return m.bindMount(ctx, source, target, readOnly)
|
||||
}
|
||||
|
||||
func (m *Manager) bindMount(ctx context.Context, source, target string, readOnly bool) error {
|
||||
if err := m.sudo(ctx, "mount", "--bind", source, target); err != nil {
|
||||
return err
|
||||
}
|
||||
if !readOnly {
|
||||
return nil
|
||||
}
|
||||
// Single-step ro bind isn't honored by all kernels — the bind happens
|
||||
// rw and the ro flag is silently ignored. Remount makes it stick.
|
||||
return m.sudo(ctx, "mount", "-o", "remount,bind,ro", target)
|
||||
}
|
||||
|
||||
// mountsUnder returns the list of mount targets at or under chrootRoot.
|
||||
// findmnt's output is one path per line; an empty list means no leftovers.
|
||||
func (m *Manager) mountsUnder(ctx context.Context, chrootRoot string) ([]string, error) {
|
||||
out, err := m.runner.Run(ctx, "findmnt", "--output", "TARGET", "--list", "--noheadings")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var mounts []string
|
||||
prefix := chrootRoot + string(os.PathSeparator)
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
t := strings.TrimSpace(line)
|
||||
if t == chrootRoot || strings.HasPrefix(t, prefix) {
|
||||
mounts = append(mounts, t)
|
||||
}
|
||||
}
|
||||
return mounts, nil
|
||||
}
|
||||
|
||||
func (m *Manager) stageDrive(ctx context.Context, chrootRoot string, d ChrootDriveSpec, uid, gid int) error {
|
||||
info, err := os.Stat(d.HostPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if info.Mode()&os.ModeDevice != 0 {
|
||||
stat, ok := info.Sys().(*syscall.Stat_t)
|
||||
if !ok {
|
||||
return fmt.Errorf("stat %s: cannot read device numbers", d.HostPath)
|
||||
}
|
||||
major := unix.Major(stat.Rdev)
|
||||
minor := unix.Minor(stat.Rdev)
|
||||
return m.mknodBlock(ctx, chrootRoot, d.ChrootName, major, minor, uid, gid)
|
||||
}
|
||||
return m.linkInto(ctx, chrootRoot, d.HostPath, d.ChrootName, uid, gid)
|
||||
}
|
||||
|
||||
func (m *Manager) stageDevice(ctx context.Context, chrootRoot, chrootName, hostDevice string, uid, gid int) error {
|
||||
info, err := os.Stat(hostDevice)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
stat, ok := info.Sys().(*syscall.Stat_t)
|
||||
if !ok {
|
||||
return fmt.Errorf("stat %s: cannot read device numbers", hostDevice)
|
||||
}
|
||||
major := unix.Major(stat.Rdev)
|
||||
minor := unix.Minor(stat.Rdev)
|
||||
target := chrootRoot + "/" + chrootName
|
||||
if err := m.sudo(ctx, "mknod", "-m", "0660", target, "c", strconv.FormatUint(uint64(major), 10), strconv.FormatUint(uint64(minor), 10)); err != nil {
|
||||
return err
|
||||
}
|
||||
return m.chown(ctx, target, uid, gid)
|
||||
}
|
||||
|
||||
func (m *Manager) mknodBlock(ctx context.Context, chrootRoot, name string, major, minor uint32, uid, gid int) error {
|
||||
target := chrootRoot + "/" + name
|
||||
if err := m.sudo(ctx, "mknod", "-m", "0660", target, "b", strconv.FormatUint(uint64(major), 10), strconv.FormatUint(uint64(minor), 10)); err != nil {
|
||||
return err
|
||||
}
|
||||
return m.chown(ctx, target, uid, gid)
|
||||
}
|
||||
|
||||
func (m *Manager) linkInto(ctx context.Context, chrootRoot, source, name string, uid, gid int) error {
|
||||
target := chrootRoot + "/" + name
|
||||
if err := m.sudo(ctx, "ln", "-f", source, target); err != nil {
|
||||
return err
|
||||
}
|
||||
return m.chown(ctx, target, uid, gid)
|
||||
}
|
||||
|
||||
func (m *Manager) chown(ctx context.Context, target string, uid, gid int) error {
|
||||
return m.sudo(ctx, "chown", fmt.Sprintf("%d:%d", uid, gid), target)
|
||||
}
|
||||
|
||||
func (m *Manager) sudo(ctx context.Context, name string, args ...string) error {
|
||||
if os.Geteuid() == 0 {
|
||||
_, err := m.runner.Run(ctx, name, args...)
|
||||
return err
|
||||
}
|
||||
_, err := m.runner.RunSudo(ctx, append([]string{name}, args...)...)
|
||||
return err
|
||||
}
|
||||
|
||||
func waitForPath(ctx context.Context, path string, timeout time.Duration, label string) error {
|
||||
return pollPath(ctx, path, timeout, 100*time.Millisecond, label)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,8 +3,10 @@ package daemon
|
|||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
|
|
@ -39,6 +41,7 @@ type privilegedOps interface {
|
|||
KillProcess(context.Context, int) error
|
||||
SignalProcess(context.Context, int, string) error
|
||||
ProcessRunning(context.Context, int, string) (bool, error)
|
||||
CleanupJailerChroot(context.Context, string) error
|
||||
}
|
||||
|
||||
type localPrivilegedOps struct {
|
||||
|
|
@ -170,7 +173,77 @@ func (o *localPrivilegedOps) ResolveFirecrackerBinary(_ context.Context, request
|
|||
}
|
||||
|
||||
func (o *localPrivilegedOps) LaunchFirecracker(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (int, error) {
|
||||
machine, err := firecracker.NewMachine(ctx, firecracker.MachineConfig{
|
||||
mc, err := o.buildLaunchMachineConfig(ctx, req)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
// Symlink before Start: with jailer the actual API socket lives at
|
||||
// `<chroot>/firecracker.socket` (~120+ bytes — over the AF_UNIX
|
||||
// sun_path limit of 108). The SDK's waitForSocket and connect(2)
|
||||
// would EINVAL on the long path. Pre-creating the symlink at the
|
||||
// short req.SocketPath lets the SDK poll/connect via the short
|
||||
// path; the kernel only enforces sun_path on the path you pass,
|
||||
// not on the resolved target.
|
||||
if err := o.exposeJailerSockets(req); err != nil {
|
||||
return 0, fmt.Errorf("expose jailer sockets: %w", err)
|
||||
}
|
||||
machine, err := firecracker.NewMachine(ctx, mc)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
chownDone := o.maybeChownSockets(ctx, req, mc)
|
||||
startErr := machine.Start(ctx)
|
||||
chownErr := <-chownDone
|
||||
if startErr != nil {
|
||||
if pid := o.fc().ResolvePID(context.Background(), machine, mc.SocketPath); pid > 0 {
|
||||
_ = o.KillProcess(context.Background(), pid)
|
||||
}
|
||||
return 0, startErr
|
||||
}
|
||||
if chownErr != nil {
|
||||
return 0, chownErr
|
||||
}
|
||||
if req.Jailer == nil {
|
||||
// Belt-and-suspenders for the legacy direct-firecracker path.
|
||||
// The jailer path doesn't need this — firecracker drops to the
|
||||
// configured uid before creating the socket.
|
||||
if err := o.EnsureSocketAccess(ctx, mc.SocketPath, "firecracker api socket"); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if strings.TrimSpace(mc.VSockPath) != "" {
|
||||
if err := o.EnsureSocketAccess(ctx, mc.VSockPath, "firecracker vsock socket"); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
}
|
||||
pid := o.fc().ResolvePID(context.Background(), machine, mc.SocketPath)
|
||||
if pid <= 0 {
|
||||
return 0, errors.New("firecracker started but pid could not be resolved")
|
||||
}
|
||||
return pid, nil
|
||||
}
|
||||
|
||||
// maybeChownSockets runs the post-Start sudo-chown race only on the legacy
|
||||
// direct-firecracker path. With the jailer the firecracker process is
|
||||
// already running as the configured uid before it creates the socket, so
|
||||
// no chown is needed (and chown on the symlink would tweak the symlink's
|
||||
// metadata — not the target's — anyway).
|
||||
func (o *localPrivilegedOps) maybeChownSockets(ctx context.Context, req roothelper.FirecrackerLaunchRequest, mc firecracker.MachineConfig) <-chan error {
|
||||
if req.Jailer != nil {
|
||||
ch := make(chan error, 1)
|
||||
ch <- nil
|
||||
close(ch)
|
||||
return ch
|
||||
}
|
||||
return o.fc().EnsureSocketAccessForAsync(ctx, []string{mc.SocketPath, mc.VSockPath}, o.clientUID, o.clientGID)
|
||||
}
|
||||
|
||||
// buildLaunchMachineConfig mirrors the helper-side equivalent: when jailer
|
||||
// is enabled, stage the chroot tree and rewrite the path fields to their
|
||||
// chroot-translated form (host-visible for sockets, chroot-internal for
|
||||
// kernel/drives — see firecracker.MachineConfig.Jailer doc).
|
||||
func (o *localPrivilegedOps) buildLaunchMachineConfig(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (firecracker.MachineConfig, error) {
|
||||
mc := firecracker.MachineConfig{
|
||||
BinaryPath: req.BinaryPath,
|
||||
VMID: req.VMID,
|
||||
SocketPath: req.SocketPath,
|
||||
|
|
@ -186,40 +259,101 @@ func (o *localPrivilegedOps) LaunchFirecracker(ctx context.Context, req roothelp
|
|||
VCPUCount: req.VCPUCount,
|
||||
MemoryMiB: req.MemoryMiB,
|
||||
Logger: o.logger,
|
||||
})
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
// Race the chown against the SDK's HTTP probe inside Start: when the
|
||||
// daemon is non-root, firecracker is launched under sudo and the API
|
||||
// socket appears root-owned. Without a concurrent chown the SDK's
|
||||
// connect(2) gets EACCES and Start times out before our post-Start
|
||||
// EnsureSocketAccess can ever run.
|
||||
chownDone := o.fc().EnsureSocketAccessForAsync(ctx, []string{req.SocketPath, req.VSockPath}, o.clientUID, o.clientGID)
|
||||
startErr := machine.Start(ctx)
|
||||
chownErr := <-chownDone
|
||||
if startErr != nil {
|
||||
if pid := o.fc().ResolvePID(context.Background(), machine, req.SocketPath); pid > 0 {
|
||||
_ = o.KillProcess(context.Background(), pid)
|
||||
}
|
||||
return 0, startErr
|
||||
if req.Jailer == nil {
|
||||
return mc, nil
|
||||
}
|
||||
if chownErr != nil {
|
||||
return 0, chownErr
|
||||
chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
|
||||
driveSpecs := make([]fcproc.ChrootDriveSpec, 0, len(req.Drives))
|
||||
chrootDrives := make([]firecracker.DriveConfig, 0, len(req.Drives))
|
||||
for _, d := range req.Drives {
|
||||
name := chrootDriveName(d)
|
||||
driveSpecs = append(driveSpecs, fcproc.ChrootDriveSpec{ChrootName: name, HostPath: d.Path})
|
||||
chrootDrives = append(chrootDrives, firecracker.DriveConfig{
|
||||
ID: d.ID,
|
||||
Path: "/" + name,
|
||||
ReadOnly: d.ReadOnly,
|
||||
IsRoot: d.IsRoot,
|
||||
})
|
||||
}
|
||||
if err := o.EnsureSocketAccess(ctx, req.SocketPath, "firecracker api socket"); err != nil {
|
||||
return 0, err
|
||||
wantVSock := strings.TrimSpace(req.VSockPath) != ""
|
||||
if err := o.fc().PrepareJailerChroot(ctx, chrootRoot,
|
||||
req.Jailer.UID, req.Jailer.GID,
|
||||
req.BinaryPath,
|
||||
req.KernelImagePath, "vmlinux",
|
||||
req.InitrdPath, "initrd",
|
||||
driveSpecs, wantVSock,
|
||||
); err != nil {
|
||||
return firecracker.MachineConfig{}, fmt.Errorf("prepare jailer chroot: %w", err)
|
||||
}
|
||||
// SocketPath stays the short request path: the SDK polls/connects
|
||||
// to it via os.Stat / net.Dial("unix", ...), and AF_UNIX sun_path
|
||||
// is hard-capped at 108 bytes — the actual chroot path is well over
|
||||
// that. exposeJailerSockets pre-creates the req.SocketPath as a
|
||||
// symlink whose target is the long chroot socket; the kernel only
|
||||
// enforces sun_path on the path you hand to connect, not on the
|
||||
// resolved target.
|
||||
//
|
||||
// VSockPath, by contrast, is sent to firecracker via the API and
|
||||
// resolved from inside the chroot, so it must be the chroot-internal
|
||||
// path. The host-visible vsock socket is reachable via a symlink
|
||||
// at req.VSockPath, also installed by exposeJailerSockets.
|
||||
_ = chrootRoot
|
||||
if wantVSock {
|
||||
mc.VSockPath = firecracker.JailerVSockName
|
||||
}
|
||||
mc.KernelImagePath = "/vmlinux"
|
||||
if strings.TrimSpace(req.InitrdPath) != "" {
|
||||
mc.InitrdPath = "/initrd"
|
||||
} else {
|
||||
mc.InitrdPath = ""
|
||||
}
|
||||
mc.Drives = chrootDrives
|
||||
// LogPath stays set so buildProcessRunner's openLogFile captures firecracker
|
||||
// stderr via cmd.Stderr. buildConfig clears sdk.Config.LogPath for jailer
|
||||
// mode to avoid PUT /logger with a host path firecracker can't open.
|
||||
mc.MetricsPath = ""
|
||||
mc.Jailer = &firecracker.JailerOpts{
|
||||
Binary: req.Jailer.Binary,
|
||||
ChrootBaseDir: req.Jailer.ChrootBaseDir,
|
||||
UID: req.Jailer.UID,
|
||||
GID: req.Jailer.GID,
|
||||
}
|
||||
return mc, nil
|
||||
}
|
||||
|
||||
func (o *localPrivilegedOps) exposeJailerSockets(req roothelper.FirecrackerLaunchRequest) error {
|
||||
if req.Jailer == nil {
|
||||
return nil
|
||||
}
|
||||
chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
|
||||
hostAPI := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerSocketName, "/"))
|
||||
if err := atomicSymlink(hostAPI, req.SocketPath); err != nil {
|
||||
return err
|
||||
}
|
||||
if strings.TrimSpace(req.VSockPath) != "" {
|
||||
if err := o.EnsureSocketAccess(ctx, req.VSockPath, "firecracker vsock socket"); err != nil {
|
||||
return 0, err
|
||||
hostVSock := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerVSockName, "/"))
|
||||
if err := atomicSymlink(hostVSock, req.VSockPath); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
pid := o.fc().ResolvePID(context.Background(), machine, req.SocketPath)
|
||||
if pid <= 0 {
|
||||
return 0, errors.New("firecracker started but pid could not be resolved")
|
||||
return nil
|
||||
}
|
||||
|
||||
// chrootDriveName mirrors the helper-side helper of the same name; kept as
|
||||
// a free function so both paths produce identical chroot layouts.
|
||||
func chrootDriveName(d firecracker.DriveConfig) string {
|
||||
if id := strings.TrimSpace(d.ID); id != "" {
|
||||
return id
|
||||
}
|
||||
return pid, nil
|
||||
return filepath.Base(d.Path)
|
||||
}
|
||||
|
||||
// atomicSymlink makes link a symlink pointing at target, replacing
// whatever non-directory entry is currently at link.
//
// The new symlink is created under a temporary name next to link and then
// renamed over it. rename(2) swaps the entry in one step, so observers
// never see link missing, and a failure part-way leaves the previous link
// intact. The earlier remove-then-symlink sequence had both problems.
//
// The temporary name is derived from link and the process ID; a leftover
// from a crashed earlier attempt is removed first because os.Symlink
// refuses to overwrite.
func atomicSymlink(target, link string) error {
	tmp := fmt.Sprintf("%s.tmp.%d", link, os.Getpid())
	if err := os.Remove(tmp); err != nil && !os.IsNotExist(err) {
		return err
	}
	if err := os.Symlink(target, tmp); err != nil {
		return err
	}
	if err := os.Rename(tmp, link); err != nil {
		// Don't leave the temporary entry behind on failure.
		_ = os.Remove(tmp)
		return err
	}
	return nil
}
|
||||
|
||||
func (o *localPrivilegedOps) EnsureSocketAccess(ctx context.Context, socketPath, label string) error {
|
||||
|
|
@ -246,6 +380,10 @@ func (o *localPrivilegedOps) ProcessRunning(_ context.Context, pid int, apiSock
|
|||
return system.ProcessRunning(pid, apiSock), nil
|
||||
}
|
||||
|
||||
func (o *localPrivilegedOps) CleanupJailerChroot(ctx context.Context, chrootRoot string) error {
|
||||
return o.fc().CleanupJailerChroot(ctx, chrootRoot)
|
||||
}
|
||||
|
||||
func (o *localPrivilegedOps) fc() *fcproc.Manager {
|
||||
return fcproc.New(o.runner, fcproc.Config{
|
||||
FirecrackerBin: normalizeFirecrackerBinary("", o.config.FirecrackerBin),
|
||||
|
|
@ -320,7 +458,27 @@ func (o *helperPrivilegedOps) ResolveFirecrackerBinary(ctx context.Context, requ
|
|||
|
||||
func (o *helperPrivilegedOps) LaunchFirecracker(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (int, error) {
|
||||
req.Network = o.networkConfig()
|
||||
return o.client.LaunchFirecracker(ctx, req)
|
||||
pid, err := o.client.LaunchFirecracker(ctx, req)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
// The root helper runs with PrivateMounts=yes, so symlinks it creates
|
||||
// (exposeJailerSockets) are invisible to the daemon's namespace. Re-create
|
||||
// them here so the daemon can reach the API and vsock sockets.
|
||||
if req.Jailer != nil {
|
||||
chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
|
||||
hostAPI := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerSocketName, "/"))
|
||||
if err := atomicSymlink(hostAPI, req.SocketPath); err != nil {
|
||||
return 0, fmt.Errorf("api socket symlink: %w", err)
|
||||
}
|
||||
if strings.TrimSpace(req.VSockPath) != "" {
|
||||
hostVSock := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerVSockName, "/"))
|
||||
if err := atomicSymlink(hostVSock, req.VSockPath); err != nil {
|
||||
return 0, fmt.Errorf("vsock symlink: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
return pid, nil
|
||||
}
|
||||
|
||||
func (o *helperPrivilegedOps) EnsureSocketAccess(ctx context.Context, socketPath, label string) error {
|
||||
|
|
@ -348,6 +506,10 @@ func (o *helperPrivilegedOps) ProcessRunning(ctx context.Context, pid int, apiSo
|
|||
return o.client.ProcessRunning(ctx, pid, apiSock)
|
||||
}
|
||||
|
||||
func (o *helperPrivilegedOps) CleanupJailerChroot(ctx context.Context, chrootRoot string) error {
|
||||
return o.client.CleanupJailerChroot(ctx, chrootRoot)
|
||||
}
|
||||
|
||||
func (o *helperPrivilegedOps) networkConfig() roothelper.NetworkConfig {
|
||||
return roothelper.NetworkConfig{
|
||||
BridgeName: o.config.BridgeName,
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import (
|
|||
"time"
|
||||
|
||||
"banger/internal/daemon/fcproc"
|
||||
"banger/internal/firecracker"
|
||||
"banger/internal/model"
|
||||
"banger/internal/namegen"
|
||||
"banger/internal/system"
|
||||
|
|
@ -149,14 +150,35 @@ func (s *VMService) cleanupRuntime(ctx context.Context, vm model.VMRecord, prese
|
|||
if vm.Runtime.VSockPath != "" {
|
||||
_ = os.Remove(vm.Runtime.VSockPath)
|
||||
}
|
||||
// Remove the jailer chroot tree (kernel hard-links, mknod'd device
|
||||
// nodes, the chroot root itself). Skipped silently when no chroot base
// is configured or the chroot was never created. We intentionally
|
||||
// don't gate on JailerEnabled today — old VMs created before the
|
||||
// flag flipped on still need their chroots removed if any exist.
|
||||
jailerErr := s.cleanupJailerChroot(ctx, vm)
|
||||
// The handles are only meaningful while the kernel objects exist;
|
||||
// dropping them here keeps the cache in sync with reality even
|
||||
// when the caller forgets to call clearVMHandles explicitly.
|
||||
s.clearVMHandles(vm)
|
||||
if !preserveDisks && vm.Runtime.VMDir != "" {
|
||||
return errors.Join(waitErr, snapshotErr, featureErr, tapErr, os.RemoveAll(vm.Runtime.VMDir))
|
||||
return errors.Join(waitErr, snapshotErr, featureErr, tapErr, jailerErr, os.RemoveAll(vm.Runtime.VMDir))
|
||||
}
|
||||
return errors.Join(waitErr, snapshotErr, featureErr, tapErr)
|
||||
return errors.Join(waitErr, snapshotErr, featureErr, tapErr, jailerErr)
|
||||
}
|
||||
|
||||
// cleanupJailerChroot removes the per-VM chroot tree if it exists. Returns
|
||||
// nil silently when the jailer was never enabled or the chroot path can't
|
||||
// be computed (no JailerChrootBase configured).
|
||||
func (s *VMService) cleanupJailerChroot(ctx context.Context, vm model.VMRecord) error {
|
||||
base := strings.TrimSpace(s.config.JailerChrootBase)
|
||||
if base == "" {
|
||||
return nil
|
||||
}
|
||||
chrootRoot := firecracker.JailerChrootRoot(base, vm.ID)
|
||||
if _, err := os.Stat(chrootRoot); os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return s.privOps().CleanupJailerChroot(ctx, chrootRoot)
|
||||
}
|
||||
|
||||
func (s *VMService) generateName(ctx context.Context) (string, error) {
|
||||
|
|
|
|||
|
|
@ -14,6 +14,24 @@ import (
|
|||
"banger/internal/system"
|
||||
)
|
||||
|
||||
// jailerOpts returns the jailer launch options to bundle in the firecracker
|
||||
// launch request, or nil when the jailer is disabled or misconfigured.
|
||||
// nil makes the launch fall back to the legacy direct-firecracker path.
|
||||
func (s *VMService) jailerOpts() *roothelper.JailerLaunchOpts {
|
||||
if !s.config.JailerEnabled {
|
||||
return nil
|
||||
}
|
||||
if strings.TrimSpace(s.config.JailerBin) == "" || strings.TrimSpace(s.config.JailerChrootBase) == "" {
|
||||
return nil
|
||||
}
|
||||
return &roothelper.JailerLaunchOpts{
|
||||
Binary: s.config.JailerBin,
|
||||
ChrootBaseDir: s.config.JailerChrootBase,
|
||||
UID: os.Getuid(),
|
||||
GID: os.Getgid(),
|
||||
}
|
||||
}
|
||||
|
||||
// buildKernelArgs assembles the kernel command line for a start.
|
||||
// Direct-boot images (no initrd) get kernel-level IP config so the
|
||||
// network is up before init, plus init= pointing at the universal
|
||||
|
|
@ -344,6 +362,7 @@ func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startS
|
|||
VSockCID: sc.vm.Runtime.VSockCID,
|
||||
VCPUCount: sc.vm.Spec.VCPUCount,
|
||||
MemoryMiB: sc.vm.Spec.MemoryMiB,
|
||||
Jailer: s.jailerOpts(),
|
||||
}
|
||||
machineConfig := firecracker.MachineConfig{Drives: launchReq.Drives}
|
||||
s.capHooks.contributeMachine(&machineConfig, *sc.vm, sc.image)
|
||||
|
|
|
|||
|
|
@ -6,6 +6,8 @@ import (
|
|||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
|
|
@ -32,8 +34,34 @@ type MachineConfig struct {
|
|||
VCPUCount int
|
||||
MemoryMiB int
|
||||
Logger *slog.Logger
|
||||
// Jailer, when non-nil, wraps firecracker in `jailer`. Path fields
|
||||
// (SocketPath, KernelImagePath, InitrdPath, Drives[].Path, VSockPath)
|
||||
// MUST be pre-translated by the caller: SocketPath/VSockPath as
|
||||
// host-visible chroot paths; the rest as chroot-internal paths
|
||||
// (jailer chroots before exec, so firecracker resolves them inside
|
||||
// the chroot).
|
||||
Jailer *JailerOpts
|
||||
}
|
||||
|
||||
// JailerOpts holds the jailer-specific launch settings. This package only
// builds the launch command: the chroot tree at
// `<ChrootBaseDir>/firecracker/<VMID>/root/`, including the kernel and drive
// nodes inside it, must already have been staged by the caller before
// NewMachine is invoked.
type JailerOpts struct {
	// Binary is the jailer executable that is run as the launch command.
	Binary string
	// ChrootBaseDir is passed to the jailer as --chroot-base-dir.
	ChrootBaseDir string
	// UID and GID are passed to the jailer as --uid and --gid.
	UID, GID int
}
|
||||
|
||||
const (
	// JailerSocketName is the chroot-relative path of the API socket, handed
	// to firecracker through --api-sock. It sits directly at the chroot root
	// (no /run/ subdirectory) so nothing depends on the jailer creating
	// intermediate directories.
	JailerSocketName = "/firecracker.socket"

	// JailerVSockName is the chroot-relative path of the vsock UDS, the
	// counterpart of JailerSocketName.
	JailerVSockName = "/vsock.sock"
)
|
||||
|
||||
type DriveConfig struct {
|
||||
ID string
|
||||
Path string
|
||||
|
|
@ -74,6 +102,13 @@ func NewMachine(ctx context.Context, cfg MachineConfig) (*Machine, error) {
|
|||
return &Machine{machine: machine, logFile: logFile}, nil
|
||||
}
|
||||
|
||||
// JailerChrootRoot computes the host-side path of the jailer chroot root for
// the VM vmid below base. The result follows the directory layout the
// firecracker jailer builds: <base>/firecracker/<vmid>/root.
func JailerChrootRoot(base, vmid string) string {
	segments := []string{base, "firecracker", vmid, "root"}
	return filepath.Join(segments...)
}
|
||||
|
||||
func (m *Machine) Start(ctx context.Context) error {
|
||||
// The caller's ctx is INTENTIONALLY not forwarded to the SDK.
|
||||
// firecracker-go-sdk's startVMM (machine.go) spawns a goroutine
|
||||
|
|
@ -141,7 +176,7 @@ func buildConfig(cfg MachineConfig) sdk.Config {
|
|||
}
|
||||
drives := drivesBuilder.Build()
|
||||
|
||||
return sdk.Config{
|
||||
out := sdk.Config{
|
||||
SocketPath: cfg.SocketPath,
|
||||
LogPath: cfg.LogPath,
|
||||
MetricsPath: cfg.MetricsPath,
|
||||
|
|
@ -162,6 +197,18 @@ func buildConfig(cfg MachineConfig) sdk.Config {
|
|||
},
|
||||
VMID: cfg.VMID,
|
||||
}
|
||||
if cfg.Jailer != nil {
|
||||
// The path fields above are already chroot-translated by the
|
||||
// caller (see MachineConfig.Jailer doc). Skip the SDK's host-side
|
||||
// existence checks — kernel/drives live inside the chroot, not
|
||||
// at the paths we report.
|
||||
out.DisableValidation = true
|
||||
// LogPath is the host-side file used only for cmd.Stderr capture.
|
||||
// Clearing it here prevents the SDK from sending PUT /logger with
|
||||
// a host path that firecracker can't open from inside the chroot.
|
||||
out.LogPath = ""
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func buildVsockDevices(cfg MachineConfig) []sdk.VsockDevice {
|
||||
|
|
@ -214,13 +261,26 @@ func defaultDriveID(drive DriveConfig, fallback string) string {
|
|||
// fcproc.EnsureSocketAccessForAsync immediately *before* Machine.Start so the
|
||||
// chown wins the race against the SDK's HTTP probe over the API socket. That
|
||||
// replaces the previous in-shell chown_watcher.
|
||||
//
|
||||
// When cfg.Jailer is set, the launch is wrapped by `jailer`. The chroot tree
|
||||
// MUST already be staged (kernel hard-linked, drives mknod'd, dirs chowned to
|
||||
// the configured UID:GID) — see fcproc.PrepareJailerChroot. The SDK's own
|
||||
// JailerCfg path is intentionally bypassed: it cannot mknod block devices and
|
||||
// does not expose --new-pid-ns.
|
||||
func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd {
|
||||
args := []string{"--api-sock", cfg.SocketPath, "--id", cfg.VMID}
|
||||
var bin string
|
||||
var args []string
|
||||
if cfg.Jailer != nil {
|
||||
bin, args = jailerArgs(cfg)
|
||||
} else {
|
||||
bin = cfg.BinaryPath
|
||||
args = []string{"--api-sock", cfg.SocketPath, "--id", cfg.VMID}
|
||||
}
|
||||
var cmd *exec.Cmd
|
||||
if os.Geteuid() == 0 {
|
||||
cmd = exec.Command(cfg.BinaryPath, args...)
|
||||
cmd = exec.Command(bin, args...)
|
||||
} else {
|
||||
cmd = exec.Command("sudo", append([]string{"-n", "-E", cfg.BinaryPath}, args...)...)
|
||||
cmd = exec.Command("sudo", append([]string{"-n", "-E", bin}, args...)...)
|
||||
}
|
||||
cmd.Stdin = nil
|
||||
if logFile != nil {
|
||||
|
|
@ -230,6 +290,28 @@ func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd {
|
|||
return cmd
|
||||
}
|
||||
|
||||
// jailerArgs returns the (binary, args) tuple for the jailer wrapper.
|
||||
// firecracker's flags are passed after `--`. --new-pid-ns gives the guest
|
||||
// VMM its own PID namespace; the SDK's JailerCommandBuilder doesn't expose
|
||||
// it in v1.0.0, which is the main reason this path doesn't go through
|
||||
// sdk.Config.JailerCfg.
|
||||
func jailerArgs(cfg MachineConfig) (string, []string) {
|
||||
args := []string{
|
||||
"--id", cfg.VMID,
|
||||
"--uid", strconv.Itoa(cfg.Jailer.UID),
|
||||
"--gid", strconv.Itoa(cfg.Jailer.GID),
|
||||
"--exec-file", cfg.BinaryPath,
|
||||
"--chroot-base-dir", cfg.Jailer.ChrootBaseDir,
|
||||
// "--new-pid-ns": jailer forks when creating the PID namespace; the
|
||||
// SDK tracks the parent's PID, which exits immediately, causing the
|
||||
// SDK's "process exited" goroutine to tear down the API socket while
|
||||
// firecracker is still booting in the child. Left out intentionally.
|
||||
"--",
|
||||
"--api-sock", JailerSocketName,
|
||||
}
|
||||
return cfg.Jailer.Binary, args
|
||||
}
|
||||
|
||||
func newLogger(base *slog.Logger) *logrus.Entry {
|
||||
logger := logrus.New()
|
||||
logger.SetOutput(io.Discard)
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ const (
|
|||
DefaultStatsPollInterval = 10 * time.Second
|
||||
DefaultStaleSweepInterval = 1 * time.Minute
|
||||
MaxDiskBytes int64 = 128 * 1024 * 1024 * 1024
|
||||
DefaultJailerBinary = "/usr/bin/jailer"
|
||||
)
|
||||
|
||||
type VMState string
|
||||
|
|
@ -36,6 +37,9 @@ const (
|
|||
type DaemonConfig struct {
|
||||
LogLevel string
|
||||
FirecrackerBin string
|
||||
JailerBin string
|
||||
JailerEnabled bool
|
||||
JailerChrootBase string
|
||||
SSHKeyPath string
|
||||
HostHomeDir string
|
||||
AutoStopStaleAfter time.Duration
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@ const (
|
|||
methodKillProcess = "priv.kill_process"
|
||||
methodSignalProcess = "priv.signal_process"
|
||||
methodProcessRunning = "priv.process_running"
|
||||
methodCleanupJailerChroot = "priv.cleanup_jailer_chroot"
|
||||
rootfsDMNamePrefix = "fc-rootfs-"
|
||||
vmTapPrefix = "tap-fc-"
|
||||
tapPoolPrefix = "tap-pool-"
|
||||
|
|
@ -82,6 +83,18 @@ type FirecrackerLaunchRequest struct {
|
|||
VCPUCount int `json:"vcpu_count"`
|
||||
MemoryMiB int `json:"memory_mib"`
|
||||
Network NetworkConfig `json:"network"`
|
||||
Jailer *JailerLaunchOpts `json:"jailer,omitempty"`
|
||||
}
|
||||
|
||||
// JailerLaunchOpts is the RPC wire form of firecracker.JailerOpts.
//
// UID and GID name the user and group the jailer drops to. The helper only
// accepts values equal to the registered owner's, so the daemon cannot ask
// for firecracker to be run as an arbitrary user.
type JailerLaunchOpts struct {
	// Binary is the path of the jailer executable.
	Binary string `json:"binary"`
	// ChrootBaseDir is the directory under which per-VM chroots are built.
	ChrootBaseDir string `json:"chroot_base_dir"`
	// UID is the user ID the jailer drops to.
	UID int `json:"uid"`
	// GID is the group ID the jailer drops to.
	GID int `json:"gid"`
}
|
||||
|
||||
type findPIDResult struct {
|
||||
|
|
@ -220,6 +233,13 @@ func (c *Client) LaunchFirecracker(ctx context.Context, req FirecrackerLaunchReq
|
|||
return result.PID, nil
|
||||
}
|
||||
|
||||
func (c *Client) CleanupJailerChroot(ctx context.Context, chrootRoot string) error {
|
||||
_, err := rpc.Call[struct{}](ctx, c.socketPath, methodCleanupJailerChroot, struct {
|
||||
ChrootRoot string `json:"chroot_root"`
|
||||
}{ChrootRoot: chrootRoot})
|
||||
return err
|
||||
}
|
||||
|
||||
func (c *Client) EnsureSocketAccess(ctx context.Context, socketPath, label string) error {
|
||||
_, err := rpc.Call[struct{}](ctx, c.socketPath, methodEnsureSocketAccess, struct {
|
||||
SocketPath string `json:"socket_path"`
|
||||
|
|
@ -589,6 +609,19 @@ func (s *Server) dispatch(ctx context.Context, req rpc.Request) rpc.Response {
|
|||
return rpc.NewError("bad_params", err.Error())
|
||||
}
|
||||
return marshalResultOrError(processRunningResult{Running: system.ProcessRunning(params.PID, params.APISock)}, nil)
|
||||
case methodCleanupJailerChroot:
|
||||
params, err := rpc.DecodeParams[struct {
|
||||
ChrootRoot string `json:"chroot_root"`
|
||||
}](req)
|
||||
if err != nil {
|
||||
return rpc.NewError("bad_params", err.Error())
|
||||
}
|
||||
systemLayout := paths.ResolveSystem()
|
||||
if err := s.validateManagedPath(params.ChrootRoot, systemLayout.StateDir, systemLayout.RuntimeDir); err != nil {
|
||||
return rpc.NewError("invalid_path", err.Error())
|
||||
}
|
||||
err = fcproc.New(s.runner, fcproc.Config{}, s.logger).CleanupJailerChroot(ctx, params.ChrootRoot)
|
||||
return marshalResultOrError(struct{}{}, err)
|
||||
default:
|
||||
return rpc.NewError("unknown_method", req.Method)
|
||||
}
|
||||
|
|
@ -718,7 +751,59 @@ func (s *Server) launchFirecracker(ctx context.Context, req FirecrackerLaunchReq
|
|||
return 0, err
|
||||
}
|
||||
}
|
||||
machine, err := firecracker.NewMachine(ctx, firecracker.MachineConfig{
|
||||
mgr := fcproc.New(s.runner, fcproc.Config{BridgeName: req.Network.BridgeName, BridgeIP: req.Network.BridgeIP, CIDR: req.Network.CIDR}, s.logger)
|
||||
mc, err := s.buildLaunchMachineConfig(ctx, req, systemLayout, mgr)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
// Pre-Start symlink: see localPrivilegedOps.LaunchFirecracker for
|
||||
// the AF_UNIX sun_path-length rationale.
|
||||
if err := s.exposeJailerSockets(req); err != nil {
|
||||
return 0, fmt.Errorf("expose jailer sockets: %w", err)
|
||||
}
|
||||
machine, err := firecracker.NewMachine(ctx, mc)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if err := machine.Start(ctx); err != nil {
|
||||
if pid := mgr.ResolvePID(context.Background(), machine, mc.SocketPath); pid > 0 {
|
||||
_, _ = s.runner.Run(context.Background(), "kill", "-KILL", strconv.Itoa(pid))
|
||||
}
|
||||
return 0, err
|
||||
}
|
||||
if req.Jailer == nil {
|
||||
// Belt-and-suspenders only on the legacy direct-firecracker path;
|
||||
// the jailer drops to the configured uid before creating the
|
||||
// socket, so its perms are correct by construction.
|
||||
if err := mgr.EnsureSocketAccessFor(ctx, mc.SocketPath, "firecracker api socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if strings.TrimSpace(mc.VSockPath) != "" {
|
||||
if err := mgr.EnsureSocketAccessFor(ctx, mc.VSockPath, "firecracker vsock socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
}
|
||||
pid := mgr.ResolvePID(context.Background(), machine, mc.SocketPath)
|
||||
if pid <= 0 {
|
||||
return 0, errors.New("firecracker started but pid could not be resolved")
|
||||
}
|
||||
return pid, nil
|
||||
}
|
||||
|
||||
// buildLaunchMachineConfig assembles the firecracker.MachineConfig used by
|
||||
// launchFirecracker, performing the chroot staging when jailer is enabled.
|
||||
// In the non-jailer case it's a straight field copy from the request.
|
||||
//
|
||||
// In the jailer case it:
|
||||
// - validates JailerLaunchOpts (binary executable, chroot under RuntimeDir,
|
||||
// uid/gid match the registered owner — the daemon can't ask the helper to
|
||||
// drop firecracker into an arbitrary uid)
|
||||
// - calls fcproc.PrepareJailerChroot to build the chroot tree
|
||||
// - rewrites SocketPath and VSockPath to host-visible chroot paths and
|
||||
// KernelImagePath/InitrdPath/Drives[].Path to chroot-internal names
|
||||
func (s *Server) buildLaunchMachineConfig(ctx context.Context, req FirecrackerLaunchRequest, layout paths.Layout, mgr *fcproc.Manager) (firecracker.MachineConfig, error) {
|
||||
mc := firecracker.MachineConfig{
|
||||
BinaryPath: req.BinaryPath,
|
||||
VMID: req.VMID,
|
||||
SocketPath: req.SocketPath,
|
||||
|
|
@ -734,31 +819,120 @@ func (s *Server) launchFirecracker(ctx context.Context, req FirecrackerLaunchReq
|
|||
VCPUCount: req.VCPUCount,
|
||||
MemoryMiB: req.MemoryMiB,
|
||||
Logger: s.logger,
|
||||
})
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if err := machine.Start(ctx); err != nil {
|
||||
manager := fcproc.New(s.runner, fcproc.Config{BridgeName: req.Network.BridgeName, BridgeIP: req.Network.BridgeIP, CIDR: req.Network.CIDR}, s.logger)
|
||||
if pid := manager.ResolvePID(context.Background(), machine, req.SocketPath); pid > 0 {
|
||||
_, _ = s.runner.Run(context.Background(), "kill", "-KILL", strconv.Itoa(pid))
|
||||
}
|
||||
return 0, err
|
||||
if req.Jailer == nil {
|
||||
return mc, nil
|
||||
}
|
||||
manager := fcproc.New(s.runner, fcproc.Config{BridgeName: req.Network.BridgeName, BridgeIP: req.Network.BridgeIP, CIDR: req.Network.CIDR}, s.logger)
|
||||
if err := manager.EnsureSocketAccessFor(ctx, req.SocketPath, "firecracker api socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil {
|
||||
return 0, err
|
||||
if err := s.validateJailerOpts(*req.Jailer, layout); err != nil {
|
||||
return firecracker.MachineConfig{}, err
|
||||
}
|
||||
chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
|
||||
driveSpecs := make([]fcproc.ChrootDriveSpec, 0, len(req.Drives))
|
||||
chrootDrives := make([]firecracker.DriveConfig, 0, len(req.Drives))
|
||||
for _, d := range req.Drives {
|
||||
name := chrootDriveName(d)
|
||||
driveSpecs = append(driveSpecs, fcproc.ChrootDriveSpec{ChrootName: name, HostPath: d.Path})
|
||||
chrootDrives = append(chrootDrives, firecracker.DriveConfig{
|
||||
ID: d.ID,
|
||||
Path: "/" + name,
|
||||
ReadOnly: d.ReadOnly,
|
||||
IsRoot: d.IsRoot,
|
||||
})
|
||||
}
|
||||
wantVSock := strings.TrimSpace(req.VSockPath) != ""
|
||||
if err := mgr.PrepareJailerChroot(ctx, chrootRoot,
|
||||
req.Jailer.UID, req.Jailer.GID,
|
||||
req.BinaryPath,
|
||||
req.KernelImagePath, "vmlinux",
|
||||
req.InitrdPath, "initrd",
|
||||
driveSpecs, wantVSock,
|
||||
); err != nil {
|
||||
return firecracker.MachineConfig{}, fmt.Errorf("prepare jailer chroot: %w", err)
|
||||
}
|
||||
// See localPrivilegedOps.buildLaunchMachineConfig for why SocketPath
|
||||
// stays the short req path but VSockPath becomes chroot-internal.
|
||||
_ = chrootRoot
|
||||
if wantVSock {
|
||||
mc.VSockPath = firecracker.JailerVSockName
|
||||
}
|
||||
mc.KernelImagePath = "/vmlinux"
|
||||
if strings.TrimSpace(req.InitrdPath) != "" {
|
||||
mc.InitrdPath = "/initrd"
|
||||
} else {
|
||||
mc.InitrdPath = ""
|
||||
}
|
||||
mc.Drives = chrootDrives
|
||||
// LogPath stays set so buildProcessRunner's openLogFile captures firecracker
|
||||
// stderr via cmd.Stderr. buildConfig clears sdk.Config.LogPath for jailer
|
||||
// mode to avoid PUT /logger with a host path firecracker can't open.
|
||||
mc.MetricsPath = ""
|
||||
mc.Jailer = &firecracker.JailerOpts{
|
||||
Binary: req.Jailer.Binary,
|
||||
ChrootBaseDir: req.Jailer.ChrootBaseDir,
|
||||
UID: req.Jailer.UID,
|
||||
GID: req.Jailer.GID,
|
||||
}
|
||||
return mc, nil
|
||||
}
|
||||
|
||||
func (s *Server) validateJailerOpts(opts JailerLaunchOpts, layout paths.Layout) error {
|
||||
if err := validateRootExecutable(opts.Binary); err != nil {
|
||||
return fmt.Errorf("jailer binary: %w", err)
|
||||
}
|
||||
// Chroot base must live under StateDir so hard-links into the chroot
|
||||
// share a filesystem with the image cache (RuntimeDir is tmpfs and
|
||||
// would EXDEV on os.Link). RuntimeDir is also accepted because the
|
||||
// jailer is happy on tmpfs when the kernel/drives happen to colocate
|
||||
// (e.g. tests).
|
||||
if err := s.validateManagedPath(opts.ChrootBaseDir, layout.StateDir, layout.RuntimeDir); err != nil {
|
||||
return fmt.Errorf("jailer chroot base: %w", err)
|
||||
}
|
||||
if opts.UID != s.meta.OwnerUID || opts.GID != s.meta.OwnerGID {
|
||||
return fmt.Errorf("jailer uid/gid (%d:%d) must match registered owner (%d:%d)", opts.UID, opts.GID, s.meta.OwnerUID, s.meta.OwnerGID)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// exposeJailerSockets makes the chroot-internal sockets reachable at the
|
||||
// host paths the daemon already references (sc.apiSock, vm.Runtime.VSockPath).
|
||||
// AF_UNIX connect(2) follows symlinks, so a symlink keeps the rest of the
|
||||
// daemon code unchanged. Computes both host targets from the chroot root and
|
||||
// the chroot-internal name, so the API socket and the vsock socket stay in
|
||||
// sync regardless of how the launch request laid them out.
|
||||
func (s *Server) exposeJailerSockets(req FirecrackerLaunchRequest) error {
|
||||
if req.Jailer == nil {
|
||||
return nil
|
||||
}
|
||||
chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
|
||||
hostAPI := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerSocketName, "/"))
|
||||
if err := atomicSymlink(hostAPI, req.SocketPath); err != nil {
|
||||
return fmt.Errorf("api socket symlink: %w", err)
|
||||
}
|
||||
if strings.TrimSpace(req.VSockPath) != "" {
|
||||
if err := manager.EnsureSocketAccessFor(ctx, req.VSockPath, "firecracker vsock socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil {
|
||||
return 0, err
|
||||
hostVSock := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerVSockName, "/"))
|
||||
if err := atomicSymlink(hostVSock, req.VSockPath); err != nil {
|
||||
return fmt.Errorf("vsock symlink: %w", err)
|
||||
}
|
||||
}
|
||||
pid := manager.ResolvePID(context.Background(), machine, req.SocketPath)
|
||||
if pid <= 0 {
|
||||
return 0, errors.New("firecracker started but pid could not be resolved")
|
||||
return nil
|
||||
}
|
||||
|
||||
// atomicSymlink points link at target, replacing whatever entry currently
// lives at link.
//
// The new symlink is first created under a temporary name next to link and
// then moved into place with a rename. Compared with remove-then-symlink this
// means link never disappears while it is being replaced, and a failure to
// create the new symlink leaves the previous entry untouched.
//
// If the rename itself is refused (for example because link is an empty
// directory), the old entry is removed and the rename retried once. The
// temporary name is derived from the process ID, so concurrent calls for the
// same link from within one process are not supported.
func atomicSymlink(target, link string) error {
	tmp := fmt.Sprintf("%s.tmp-%d", link, os.Getpid())
	// A crashed earlier attempt may have left the temporary name behind.
	if err := os.Remove(tmp); err != nil && !os.IsNotExist(err) {
		return err
	}
	if err := os.Symlink(target, tmp); err != nil {
		return err
	}
	err := os.Rename(tmp, link)
	if err != nil {
		// Rename cannot replace a directory with a symlink; clear the old
		// entry the way the remove-then-symlink approach would and retry.
		if rmErr := os.Remove(link); rmErr == nil {
			err = os.Rename(tmp, link)
		}
	}
	if err != nil {
		_ = os.Remove(tmp) // best effort: don't leave the temp link behind
		return err
	}
	return nil
}
|
||||
|
||||
// chrootDriveName returns the bare filename a drive should appear as inside
|
||||
// the chroot. We use the drive ID when present (rootfs, work, …) so the
|
||||
// chroot listing is self-explanatory; falling back to the source's basename
|
||||
// covers the unnamed case.
|
||||
func chrootDriveName(d firecracker.DriveConfig) string {
|
||||
if id := strings.TrimSpace(d.ID); id != "" {
|
||||
return id
|
||||
}
|
||||
return filepath.Base(d.Path)
|
||||
}
|
||||
|
||||
func (s *Server) validateLaunchDrivePath(drive firecracker.DriveConfig, stateDir string) error {
|
||||
|
|
|
|||
|
|
@ -172,7 +172,19 @@ func ProcessRunning(pid int, apiSock string) bool {
|
|||
return false
|
||||
}
|
||||
cmdline := strings.ReplaceAll(string(data), "\x00", " ")
|
||||
return strings.Contains(cmdline, "firecracker") && strings.Contains(cmdline, apiSock)
|
||||
if !strings.Contains(cmdline, "firecracker") {
|
||||
return false
|
||||
}
|
||||
if strings.Contains(cmdline, apiSock) {
|
||||
return true
|
||||
}
|
||||
// Jailer mode: apiSock is a symlink; firecracker's cmdline has the
|
||||
// chroot-internal path (e.g. "/firecracker.socket"), not the host path.
|
||||
// Fall back to matching the symlink target's base name.
|
||||
if target, err := os.Readlink(apiSock); err == nil {
|
||||
return strings.Contains(cmdline, filepath.Base(target))
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
type ProcessStats struct {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue