Defence-in-depth pass over every helper method that touches the host
as root. Each fix narrows what a compromised owner-uid daemon could
ask the helper to do; many close concrete file-ownership and DoS
primitives that the previous validators didn't reach.
Path / identifier validation:
* priv.fsck_snapshot now requires /dev/mapper/fc-rootfs-* (was
"is the string non-empty"). e2fsck -fy on /dev/sda1 was the
motivating exploit.
* priv.kill_process and priv.signal_process now read
/proc/<pid>/cmdline and require a "firecracker" substring before
sending the signal. Killing arbitrary host PIDs (sshd, init, …)
is no longer a one-RPC primitive.
* priv.read_ext4_file and priv.write_ext4_files now require the
image path to live under StateDir or be /dev/mapper/fc-rootfs-*.
* priv.cleanup_dm_snapshot validates every non-empty Handles field:
DM name fc-rootfs-*, DM device /dev/mapper/fc-rootfs-*, loops
/dev/loopN.
* priv.remove_dm_snapshot accepts only fc-rootfs-* names or
/dev/mapper/fc-rootfs-* paths.
* priv.ensure_nat now requires a parsable IPv4 address and a
banger-prefixed tap.
* priv.sync_resolver_routing and priv.clear_resolver_routing now
require a Linux iface-name-shaped bridge name (1–15 chars, no
whitespace/'/'/':') and, for sync, a parsable resolver address.
Symlink defence:
* priv.ensure_socket_access now validates the socket path is under
RuntimeDir and not a symlink. The fcproc layer's chown/chmod
moves to unix.Open(O_PATH|O_NOFOLLOW) + Fchownat(AT_EMPTY_PATH)
+ Fchmodat via /proc/self/fd, so even a swap of the leaf into a
symlink between validation and the syscall is refused. The
local-priv (non-root) fallback uses `chown -h`.
* priv.cleanup_jailer_chroot rejects symlinks at both the leaf
(os.Lstat) and intermediate path components (filepath.EvalSymlinks
+ clean-equality). The umount sweep was rewritten from shell
`umount --recursive --lazy` to direct unix.Unmount(MNT_DETACH |
UMOUNT_NOFOLLOW) per child mount, deepest-first; the findmnt
guard remains as the rm-rf safety net. Local-priv mode falls
back to `sudo umount --lazy`.
Binary validation:
* validateRootExecutable now opens with O_PATH|O_NOFOLLOW and
Fstats through the resulting fd. Rejects path-level symlinks and
narrows the TOCTOU window between validation and the SDK's exec
to fork+exec time on a healthy host.
Daemon socket:
* The owner daemon now reads SO_PEERCRED on every accepted
connection and refuses any UID that isn't 0 or the registered
owner. Filesystem perms (0600 + ownerUID) already enforced this;
the check is belt-and-braces in case the socket FD is ever
leaked to a non-owner process.
Docs:
* docs/privileges.md walked end-to-end. Each helper RPC's
Validation gate row reflects what the code actually enforces.
New section "Running outside the system install" calls out the
looser dev-mode trust model (NOPASSWD sudoers, helper hardening
bypassed) so users don't deploy that path on shared hosts.
Trust list updated to include every new validator.
Tests added: validators (DM-loop, DM-remove-target, DM-handles,
ext4-image-path, iface-name, IPv4, resolver-addr, not-symlink,
firecracker-PID, root-executable variants), the daemon's authorize
path (non-unix conn rejection + unix conn happy path), the umount2
ordering contract (deepest-first + --lazy on the sudo branch), and
positive/negative cases for the chown-no-follow fallback.
Verified end-to-end via `make smoke JOBS=4` on a KVM host.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
700 lines
27 KiB
Go
700 lines
27 KiB
Go
// Package fcproc owns the host-side process primitives needed to launch,
|
|
// inspect, and tear down Firecracker VMs: bridge/tap setup, binary
|
|
// resolution, socket permissions, PID lookup, graceful and forceful
|
|
// shutdown. Shared by the VM lifecycle and image build paths so neither
|
|
// needs to import the other.
|
|
package fcproc
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"syscall"
|
|
"time"
|
|
|
|
"golang.org/x/sys/unix"
|
|
|
|
"banger/internal/firecracker"
|
|
"banger/internal/system"
|
|
)
|
|
|
|
// ErrWaitForExitTimeout is the sentinel WaitForExit returns when its timeout
|
|
// elapses while the process is still running. Match it with errors.Is.
|
|
var ErrWaitForExitTimeout = errors.New("timed out waiting for VM to exit")
|
|
|
|
// Runner is the command-execution seam fcproc depends on. system.Runner
// satisfies it.
//
// Run takes the command name and its arguments separately; RunSudo takes the
// whole argv (command first) in one variadic list. Both hand back the
// command's output and an error.
type Runner interface {
	Run(ctx context.Context, command string, argv ...string) ([]byte, error)
	RunSudo(ctx context.Context, argv ...string) ([]byte, error)
}
|
|
|
|
// Config holds the host networking settings and runtime paths that Manager
// operations read.
type Config struct {
	// FirecrackerBin is either a path (anything containing a path separator)
	// or a bare command name that ResolveBinary looks up on PATH.
	FirecrackerBin string
	// BridgeName is the host bridge device TAPs are attached to.
	BridgeName string
	// BridgeIP and CIDR are joined as "<BridgeIP>/<CIDR>" when the bridge
	// address is assigned.
	BridgeIP string
	CIDR     string
	// RuntimeDir is the directory EnsureSocketDir creates for sockets.
	RuntimeDir string
}
|
|
|
|
// Manager owns the shared configuration + runner and exposes the per-process
|
|
// helpers. Stateless beyond its dependencies — safe to share.
|
|
type Manager struct {
|
|
runner Runner
|
|
cfg Config
|
|
logger *slog.Logger
|
|
}
|
|
|
|
// New returns a Manager that issues commands through runner using cfg.
|
|
func New(runner Runner, cfg Config, logger *slog.Logger) *Manager {
|
|
return &Manager{runner: runner, cfg: cfg, logger: logger}
|
|
}
|
|
|
|
// EnsureBridge makes sure the host bridge exists and is up.
|
|
func (m *Manager) EnsureBridge(ctx context.Context) error {
|
|
if _, err := m.runner.Run(ctx, "ip", "link", "show", m.cfg.BridgeName); err == nil {
|
|
_, err = m.runner.RunSudo(ctx, "ip", "link", "set", m.cfg.BridgeName, "up")
|
|
return err
|
|
}
|
|
if _, err := m.runner.RunSudo(ctx, "ip", "link", "add", "name", m.cfg.BridgeName, "type", "bridge"); err != nil {
|
|
return err
|
|
}
|
|
if _, err := m.runner.RunSudo(ctx, "ip", "addr", "add", fmt.Sprintf("%s/%s", m.cfg.BridgeIP, m.cfg.CIDR), "dev", m.cfg.BridgeName); err != nil {
|
|
return err
|
|
}
|
|
_, err := m.runner.RunSudo(ctx, "ip", "link", "set", m.cfg.BridgeName, "up")
|
|
return err
|
|
}
|
|
|
|
// EnsureSocketDir creates the runtime socket directory at 0700. This is
|
|
// the directory the daemon socket, per-VM firecracker API sockets, and
|
|
// vsock sockets all live inside, so it must be readable only by the
|
|
// invoking user.
|
|
func (m *Manager) EnsureSocketDir() error {
|
|
mode := os.FileMode(0o700)
|
|
if os.Geteuid() == 0 {
|
|
mode = 0o711
|
|
}
|
|
if err := os.MkdirAll(m.cfg.RuntimeDir, mode); err != nil {
|
|
return err
|
|
}
|
|
return os.Chmod(m.cfg.RuntimeDir, mode)
|
|
}
|
|
|
|
// CreateTap (re)creates a TAP owned by the current uid/gid, attaches it to
|
|
// the bridge, and brings both up.
|
|
func (m *Manager) CreateTap(ctx context.Context, tap string) error {
|
|
return m.CreateTapOwned(ctx, tap, os.Getuid(), os.Getgid())
|
|
}
|
|
|
|
// CreateTapOwned (re)creates a TAP owned by uid:gid, attaches it to the
|
|
// bridge, and brings both up.
|
|
func (m *Manager) CreateTapOwned(ctx context.Context, tap string, uid, gid int) error {
|
|
if _, err := m.runner.Run(ctx, "ip", "link", "show", tap); err == nil {
|
|
_, _ = m.runner.RunSudo(ctx, "ip", "link", "del", tap)
|
|
}
|
|
if _, err := m.runner.RunSudo(ctx, "ip", "tuntap", "add", "dev", tap, "mode", "tap", "user", strconv.Itoa(uid), "group", strconv.Itoa(gid)); err != nil {
|
|
return err
|
|
}
|
|
if _, err := m.runner.RunSudo(ctx, "ip", "link", "set", tap, "master", m.cfg.BridgeName); err != nil {
|
|
return err
|
|
}
|
|
if _, err := m.runner.RunSudo(ctx, "ip", "link", "set", tap, "up"); err != nil {
|
|
return err
|
|
}
|
|
_, err := m.runner.RunSudo(ctx, "ip", "link", "set", m.cfg.BridgeName, "up")
|
|
return err
|
|
}
|
|
|
|
// ResolveBinary returns the path to the firecracker binary: either an
|
|
// absolute path from config, or the first hit on PATH.
|
|
func (m *Manager) ResolveBinary() (string, error) {
|
|
if m.cfg.FirecrackerBin == "" {
|
|
return "", fmt.Errorf("firecracker binary not configured; install firecracker or set firecracker_bin")
|
|
}
|
|
path := m.cfg.FirecrackerBin
|
|
if strings.ContainsRune(path, os.PathSeparator) {
|
|
if _, err := os.Stat(path); err != nil {
|
|
return "", fmt.Errorf("firecracker binary not found at %s; install firecracker or set firecracker_bin", path)
|
|
}
|
|
return path, nil
|
|
}
|
|
resolved, err := system.LookupExecutable(path)
|
|
if err != nil {
|
|
return "", fmt.Errorf("firecracker binary %q not found in PATH; install firecracker or set firecracker_bin", path)
|
|
}
|
|
return resolved, nil
|
|
}
|
|
|
|
// EnsureSocketAccess waits for the socket to appear then chowns/chmods it to
|
|
// the current uid/gid, mode 0600.
|
|
func (m *Manager) EnsureSocketAccess(ctx context.Context, socketPath, label string) error {
|
|
return m.EnsureSocketAccessFor(ctx, socketPath, label, os.Getuid(), os.Getgid())
|
|
}
|
|
|
|
// EnsureSocketAccessFor waits for the socket to appear then chowns/chmods it
|
|
// to uid:gid, mode 0600.
|
|
func (m *Manager) EnsureSocketAccessFor(ctx context.Context, socketPath, label string, uid, gid int) error {
|
|
return m.ensureSocketAccessFor(ctx, socketPath, label, uid, gid, 5*time.Second, 100*time.Millisecond)
|
|
}
|
|
|
|
// EnsureSocketAccessForAsync runs EnsureSocketAccessFor concurrently for each
|
|
// non-empty path and returns a channel that receives a single error (nil on
|
|
// full success) once all per-path operations complete. Caller MUST receive on
|
|
// the channel to unblock the goroutine.
|
|
//
|
|
// Used during firecracker boot: the SDK's HTTP probe inside Machine.Start
|
|
// connects to the API socket the moment it appears. When firecracker is
|
|
// launched under sudo the socket is created root-owned, and the daemon's
|
|
// connect(2) gets EACCES until something chowns it. Running the chown
|
|
// concurrently with Start (instead of after Start returns, which deadlocks)
|
|
// closes the race without a shell-level chown_watcher.
|
|
//
|
|
// Uses a 25ms poll cadence (vs 100ms for the synchronous variant) to win
|
|
// against the SDK's tight HTTP retry loop.
|
|
func (m *Manager) EnsureSocketAccessForAsync(ctx context.Context, socketPaths []string, uid, gid int) <-chan error {
|
|
var clean []string
|
|
for _, p := range socketPaths {
|
|
if strings.TrimSpace(p) != "" {
|
|
clean = append(clean, p)
|
|
}
|
|
}
|
|
done := make(chan error, 1)
|
|
if len(clean) == 0 {
|
|
done <- nil
|
|
close(done)
|
|
return done
|
|
}
|
|
go func() {
|
|
defer close(done)
|
|
var wg sync.WaitGroup
|
|
errCh := make(chan error, len(clean))
|
|
for _, p := range clean {
|
|
wg.Add(1)
|
|
go func(path string) {
|
|
defer wg.Done()
|
|
if err := m.ensureSocketAccessFor(ctx, path, "firecracker socket", uid, gid, 3*time.Second, 25*time.Millisecond); err != nil {
|
|
errCh <- err
|
|
}
|
|
}(p)
|
|
}
|
|
wg.Wait()
|
|
close(errCh)
|
|
for err := range errCh {
|
|
if err != nil {
|
|
done <- err
|
|
return
|
|
}
|
|
}
|
|
done <- nil
|
|
}()
|
|
return done
|
|
}
|
|
|
|
func (m *Manager) ensureSocketAccessFor(ctx context.Context, socketPath, label string, uid, gid int, timeout, interval time.Duration) error {
|
|
if err := pollPath(ctx, socketPath, timeout, interval, label); err != nil {
|
|
return err
|
|
}
|
|
return chownChmodNoFollow(ctx, m.runner, socketPath, uid, gid, 0o600)
|
|
}
|
|
|
|
// chownChmodNoFollow sets owner/group/mode on path without following
|
|
// symlinks at the leaf. Required because the helper RPCs that drive
|
|
// socket access run as root: a follow-symlink chmod/chown becomes an
|
|
// arbitrary file-ownership primitive if the caller can plant a symlink
|
|
// at the target.
|
|
//
|
|
// Linux idiom: open with O_PATH|O_NOFOLLOW (errors out if the leaf is a
|
|
// symlink), Fstat the fd to confirm the file is a unix socket, then
|
|
// chown via Fchownat(AT_EMPTY_PATH) and chmod via /proc/self/fd/N
|
|
// (fchmod on an O_PATH fd returns EBADF, but the /proc path resolves
|
|
// straight back to the inode the fd already pins, so no leaf re-traversal
|
|
// happens).
|
|
//
|
|
// Falls back to `sudo chown -h` + `sudo chmod` for the local-priv mode
|
|
// where the daemon isn't root and can't issue the syscalls itself; the
|
|
// `-h` flag still avoids the symlink-follow on the chown side.
|
|
func chownChmodNoFollow(ctx context.Context, runner Runner, path string, uid, gid int, mode os.FileMode) error {
|
|
if os.Geteuid() != 0 {
|
|
// Mode-then-owner ordering preserves the pre-existing failure
|
|
// semantics of the legacy `chmod 600 / chown` shell-out path
|
|
// (chmod-failure tests expect chown to be skipped). `chown -h`
|
|
// keeps the symlink-no-follow guarantee on this branch.
|
|
if _, err := runner.RunSudo(ctx, "chmod", fmt.Sprintf("%o", mode.Perm()), path); err != nil {
|
|
return err
|
|
}
|
|
_, err := runner.RunSudo(ctx, "chown", "-h", fmt.Sprintf("%d:%d", uid, gid), path)
|
|
return err
|
|
}
|
|
fd, err := unix.Open(path, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
|
|
if err != nil {
|
|
return fmt.Errorf("open %s: %w", path, err)
|
|
}
|
|
defer unix.Close(fd)
|
|
var st unix.Stat_t
|
|
if err := unix.Fstat(fd, &st); err != nil {
|
|
return fmt.Errorf("fstat %s: %w", path, err)
|
|
}
|
|
if st.Mode&unix.S_IFMT != unix.S_IFSOCK {
|
|
return fmt.Errorf("%s is not a unix socket (mode %#o)", path, st.Mode&unix.S_IFMT)
|
|
}
|
|
procPath := "/proc/self/fd/" + strconv.Itoa(fd)
|
|
if err := unix.Fchmodat(unix.AT_FDCWD, procPath, uint32(mode.Perm()), 0); err != nil {
|
|
return fmt.Errorf("chmod %s: %w", path, err)
|
|
}
|
|
if err := unix.Fchownat(fd, "", uid, gid, unix.AT_EMPTY_PATH); err != nil {
|
|
return fmt.Errorf("chown %s: %w", path, err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// FindPID returns the PID of the firecracker process listening on apiSock,
|
|
// located via pgrep.
|
|
func (m *Manager) FindPID(ctx context.Context, apiSock string) (int, error) {
|
|
out, err := m.runner.Run(ctx, "pgrep", "-n", "-f", apiSock)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
return strconv.Atoi(strings.TrimSpace(string(out)))
|
|
}
|
|
|
|
// ResolvePID prefers pgrep and falls back to the firecracker machine PID.
|
|
// Returns 0 if neither source yields a PID.
|
|
func (m *Manager) ResolvePID(ctx context.Context, machine *firecracker.Machine, apiSock string) int {
|
|
if pid, err := m.FindPID(ctx, apiSock); err == nil && pid > 0 {
|
|
return pid
|
|
}
|
|
if machine != nil {
|
|
if pid, err := machine.PID(); err == nil && pid > 0 {
|
|
return pid
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// SendCtrlAltDel requests a graceful guest shutdown via the firecracker API
|
|
// socket.
|
|
func (m *Manager) SendCtrlAltDel(ctx context.Context, apiSock string) error {
|
|
if err := m.EnsureSocketAccess(ctx, apiSock, "firecracker api socket"); err != nil {
|
|
return err
|
|
}
|
|
client := firecracker.New(apiSock, m.logger)
|
|
return client.SendCtrlAltDel(ctx)
|
|
}
|
|
|
|
// WaitForExit polls until the process is gone or the timeout fires. Returns
|
|
// ErrWaitForExitTimeout on timeout, ctx.Err() on cancellation.
|
|
func (m *Manager) WaitForExit(ctx context.Context, pid int, apiSock string, timeout time.Duration) error {
|
|
deadline := time.Now().Add(timeout)
|
|
for {
|
|
if !system.ProcessRunning(pid, apiSock) {
|
|
return nil
|
|
}
|
|
if time.Now().After(deadline) {
|
|
return ErrWaitForExitTimeout
|
|
}
|
|
select {
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
case <-time.After(100 * time.Millisecond):
|
|
}
|
|
}
|
|
}
|
|
|
|
// Kill sends SIGKILL to pid.
|
|
func (m *Manager) Kill(ctx context.Context, pid int) error {
|
|
_, err := m.runner.RunSudo(ctx, "kill", "-KILL", strconv.Itoa(pid))
|
|
return err
|
|
}
|
|
|
|
// ChrootDriveSpec maps one drive from the host into the jailer chroot.
type ChrootDriveSpec struct {
	// ChrootName is the bare filename the drive is reachable as inside the
	// chroot (e.g. "rootfs").
	ChrootName string
	// HostPath is the host-side source: a regular file or a /dev/mapper/*
	// block device. A device node is recreated inside the chroot with mknod
	// using the source's major/minor; anything else is hard-linked.
	HostPath string
}
|
|
|
|
// PrepareJailerChroot stages the chroot tree at chrootRoot for the jailer
|
|
// to take over on launch. After this call:
|
|
//
|
|
// - chrootRoot exists, mode 0700, owned by uid:gid.
|
|
// - chrootRoot/<kernel-name> is a hard link of kernelHostPath, owned uid:gid.
|
|
// - chrootRoot/<initrd-name> is a hard link of initrdHostPath if set.
|
|
// - For each drive: a hard link (regular file source) or a freshly
|
|
// mknod'd block device with the source's major/minor (DM source).
|
|
// - If wantVSock, /dev/vhost-vsock is mknod'd into the chroot so
|
|
// firecracker can open it after chroot.
|
|
//
|
|
// All filesystem mutations go through runner.RunSudo when the caller isn't
|
|
// root, so this works in dev (sudo) and system (root helper) modes alike.
|
|
// Path components are validated by the caller (roothelper) — this helper
|
|
// trusts them.
|
|
func (m *Manager) PrepareJailerChroot(ctx context.Context, chrootRoot string, uid, gid int, firecrackerHostPath, kernelHostPath, kernelName, initrdHostPath, initrdName string, drives []ChrootDriveSpec, wantVSock bool) error {
|
|
if strings.TrimSpace(chrootRoot) == "" {
|
|
return fmt.Errorf("chroot root is required")
|
|
}
|
|
if err := m.sudo(ctx, "mkdir", "-p", chrootRoot); err != nil {
|
|
return fmt.Errorf("create chroot root: %w", err)
|
|
}
|
|
if err := m.sudo(ctx, "chmod", "0700", chrootRoot); err != nil {
|
|
return fmt.Errorf("chmod chroot root: %w", err)
|
|
}
|
|
if err := m.chown(ctx, chrootRoot, uid, gid); err != nil {
|
|
return fmt.Errorf("chown chroot root: %w", err)
|
|
}
|
|
// The daemon (uid) needs to traverse the intermediate directories to reach
|
|
// the sockets firecracker creates inside the chroot. The per-VM dir
|
|
// (<base>/firecracker/<vmid>/) is chowned to uid so the daemon can reach
|
|
// <vmid>/root/. The <base>/firecracker/ base and <base>/jail/ dirs get
|
|
// world-execute (--x) so any UID can traverse through them without listing
|
|
// their contents (the per-VM dirs are still protected by their own mode).
|
|
vmDir := filepath.Dir(chrootRoot)
|
|
if err := m.chown(ctx, vmDir, uid, gid); err != nil {
|
|
return fmt.Errorf("chown vm dir: %w", err)
|
|
}
|
|
fcBaseDir := filepath.Dir(vmDir)
|
|
if err := m.sudo(ctx, "chmod", "0711", fcBaseDir); err != nil {
|
|
return fmt.Errorf("chmod firecracker base dir: %w", err)
|
|
}
|
|
jailBaseDir := filepath.Dir(fcBaseDir)
|
|
if err := m.sudo(ctx, "chmod", "0711", jailBaseDir); err != nil {
|
|
return fmt.Errorf("chmod jail base dir: %w", err)
|
|
}
|
|
// Order matters: hard-link the kernel + file-backed drives BEFORE
|
|
// the self-bind below. link(2) refuses to cross mount points even
|
|
// when the underlying superblock is the same — once chrootRoot is a
|
|
// mount point, `ln /var/lib/.../kernel <chroot>/vmlinux` returns
|
|
// EXDEV.
|
|
if err := m.linkInto(ctx, chrootRoot, kernelHostPath, kernelName, uid, gid); err != nil {
|
|
return fmt.Errorf("link kernel: %w", err)
|
|
}
|
|
if strings.TrimSpace(initrdHostPath) != "" {
|
|
if err := m.linkInto(ctx, chrootRoot, initrdHostPath, initrdName, uid, gid); err != nil {
|
|
return fmt.Errorf("link initrd: %w", err)
|
|
}
|
|
}
|
|
for _, d := range drives {
|
|
if err := m.stageDrive(ctx, chrootRoot, d, uid, gid); err != nil {
|
|
return fmt.Errorf("stage drive %s: %w", d.ChrootName, err)
|
|
}
|
|
}
|
|
if wantVSock {
|
|
// The jailer creates /dev inside the chroot, but /dev/vhost-vsock must
|
|
// be pre-staged so firecracker can open it after the jailer chroots.
|
|
devDir := chrootRoot + "/dev"
|
|
if err := m.sudo(ctx, "mkdir", "-p", devDir); err != nil {
|
|
return fmt.Errorf("create chroot/dev: %w", err)
|
|
}
|
|
if err := m.chown(ctx, devDir, uid, gid); err != nil {
|
|
return fmt.Errorf("chown chroot/dev: %w", err)
|
|
}
|
|
if err := m.stageDevice(ctx, chrootRoot, "dev/vhost-vsock", "/dev/vhost-vsock", uid, gid); err != nil {
|
|
return fmt.Errorf("stage vhost-vsock: %w", err)
|
|
}
|
|
}
|
|
// Bind firecracker + the host libdirs into the chroot read-only.
|
|
// firecracker is dynamically linked (interpreter /lib64/ld-linux-*,
|
|
// libc, libgcc), and inside the chroot ENOENT on those is reported
|
|
// as "Failed to exec into Firecracker: No such file or directory" —
|
|
// the kernel's misleading ENOENT-for-missing-interpreter error.
|
|
//
|
|
// Done last so the link/mknod steps above don't have to cross the
|
|
// self-bind mount boundary (link(2) returns EXDEV at mount edges).
|
|
// Self-bind first so CleanupJailerChroot's `umount -lR` can recurse
|
|
// from chrootRoot itself; --make-private blocks propagation back to
|
|
// the host mount namespace.
|
|
// firecracker is copied (not bind-mounted) because jailer opens the
|
|
// binary O_RDWR — apparently to seal it or rewrite something — and
|
|
// fails with EROFS on a ro-bind.
|
|
chrootFC := chrootRoot + "/" + filepath.Base(firecrackerHostPath)
|
|
if err := m.sudo(ctx, "cp", "-f", firecrackerHostPath, chrootFC); err != nil {
|
|
return fmt.Errorf("copy firecracker into chroot: %w", err)
|
|
}
|
|
if err := m.sudo(ctx, "chmod", "0755", chrootFC); err != nil {
|
|
return fmt.Errorf("chmod firecracker in chroot: %w", err)
|
|
}
|
|
if err := m.chown(ctx, chrootFC, uid, gid); err != nil {
|
|
return fmt.Errorf("chown firecracker in chroot: %w", err)
|
|
}
|
|
if err := m.sudo(ctx, "mount", "--bind", chrootRoot, chrootRoot); err != nil {
|
|
return fmt.Errorf("self-bind chroot: %w", err)
|
|
}
|
|
// Remount without nosuid: the helper unit's ReadWritePaths binding marks
|
|
// /var/lib/banger nosuid, and bind mounts inherit that flag. The jailer
|
|
// needs to exec /firecracker as UID 1000, which the kernel denies on a
|
|
// nosuid mount when NoNewPrivileges is set on the unit.
|
|
if err := m.sudo(ctx, "mount", "-o", "remount,bind,suid", chrootRoot, chrootRoot); err != nil {
|
|
return fmt.Errorf("remount chroot suid: %w", err)
|
|
}
|
|
if err := m.sudo(ctx, "mount", "--make-private", chrootRoot); err != nil {
|
|
return fmt.Errorf("make-private chroot: %w", err)
|
|
}
|
|
// Pre-create /usr with world-traversable permissions. UMask=0077 on the
|
|
// helper unit causes plain mkdir to produce 0700 dirs; UID 1000 must be
|
|
// able to traverse /usr/ to reach the dynamic linker via lib64 → usr/lib.
|
|
if err := m.sudo(ctx, "install", "-d", "-m", "0755", chrootRoot+"/usr"); err != nil {
|
|
return fmt.Errorf("create chroot/usr: %w", err)
|
|
}
|
|
// Bind real libdirs and replicate the host's compat symlinks
|
|
// (/lib64 → /usr/lib, etc) inside the chroot so firecracker's
|
|
// PT_INTERP path (/lib64/ld-linux-*) resolves to the bound libs.
|
|
for _, libDir := range []string{"/usr/lib", "/usr/lib64", "/lib", "/lib64"} {
|
|
info, err := os.Lstat(libDir)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
target := chrootRoot + libDir
|
|
if info.Mode()&os.ModeSymlink != 0 {
|
|
link, err := os.Readlink(libDir)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if err := m.sudo(ctx, "ln", "-sfn", link, target); err != nil {
|
|
return fmt.Errorf("symlink %s -> %s: %w", target, link, err)
|
|
}
|
|
continue
|
|
}
|
|
if !info.IsDir() {
|
|
continue
|
|
}
|
|
if err := m.bindDir(ctx, libDir, target, true); err != nil {
|
|
return fmt.Errorf("bind %s: %w", libDir, err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// CleanupJailerChroot tears down a chroot built by PrepareJailerChroot:
|
|
// lazy-recursive umount of every mount under (or at) chrootRoot, then a
|
|
// findmnt-guarded `rm -rf`. The guard is load-bearing: if any bind mount
|
|
// remained, `rm -rf` would descend into the bind source (e.g. /usr/lib)
|
|
// and start deleting host files. The umount runs `-l` (lazy) so an in-use
|
|
// bind point still gets detached from the namespace; the guarded check
|
|
// then catches the rare case where detachment didn't happen.
|
|
func (m *Manager) CleanupJailerChroot(ctx context.Context, chrootRoot string) error {
|
|
if strings.TrimSpace(chrootRoot) == "" {
|
|
return nil
|
|
}
|
|
// Lstat (not Stat): if chrootRoot is a symlink the umount/rm shell-outs
|
|
// below would chase it. The handler-side validateNotSymlink also catches
|
|
// this, but lifting the check inside fcproc closes the TOCTOU window
|
|
// between the handler check and our umount command.
|
|
info, err := os.Lstat(chrootRoot)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return nil
|
|
}
|
|
return fmt.Errorf("inspect chroot %s: %w", chrootRoot, err)
|
|
}
|
|
if info.Mode()&os.ModeSymlink != 0 {
|
|
return fmt.Errorf("refusing to clean up %q: path is a symlink", chrootRoot)
|
|
}
|
|
if !info.IsDir() {
|
|
return fmt.Errorf("refusing to clean up %q: not a directory", chrootRoot)
|
|
}
|
|
// Resolve any intermediate symlinks and require the result equals the
|
|
// input — that catches a planted `…/jail/firecracker/<vmid> → /` even
|
|
// though the leaf "/root" component is itself a real directory inside
|
|
// the redirected target. Equality + Lstat together cover both top and
|
|
// intermediate symlink shapes.
|
|
resolved, err := filepath.EvalSymlinks(chrootRoot)
|
|
if err != nil {
|
|
return fmt.Errorf("resolve chroot %s: %w", chrootRoot, err)
|
|
}
|
|
if filepath.Clean(resolved) != filepath.Clean(chrootRoot) {
|
|
return fmt.Errorf("refusing to clean up %q: resolves to %q via symlink", chrootRoot, resolved)
|
|
}
|
|
// Switch from `umount --recursive --lazy <chrootRoot>` (shell-resolved,
|
|
// follows symlinks at exec time) to direct umount2() syscalls per child
|
|
// mount with UMOUNT_NOFOLLOW. That fully closes the residual TOCTOU
|
|
// between the EvalSymlinks check above and the unmount: even if a daemon-
|
|
// uid attacker swapped a child mount's path to a symlink in the gap, the
|
|
// kernel refuses to follow it. The findmnt guard below still catches any
|
|
// mount we couldn't detach.
|
|
mounts, err := m.mountsUnder(ctx, chrootRoot)
|
|
if err != nil {
|
|
return fmt.Errorf("inspect chroot mounts: %w", err)
|
|
}
|
|
// Deepest-first so child mounts come off before parents; otherwise a
|
|
// parent unmount would EBUSY against in-use children.
|
|
sort.Slice(mounts, func(i, j int) bool {
|
|
return strings.Count(mounts[i], "/") > strings.Count(mounts[j], "/")
|
|
})
|
|
for _, mt := range mounts {
|
|
if err := m.detachMount(ctx, mt); err != nil {
|
|
return fmt.Errorf("detach %q: %w", mt, err)
|
|
}
|
|
}
|
|
if remaining, err := m.mountsUnder(ctx, chrootRoot); err != nil {
|
|
return fmt.Errorf("re-inspect chroot mounts: %w", err)
|
|
} else if len(remaining) > 0 {
|
|
return fmt.Errorf("refusing to rm -rf %q: still has %d mount(s): %v", chrootRoot, len(remaining), remaining)
|
|
}
|
|
return m.sudo(ctx, "rm", "-rf", "--", chrootRoot)
|
|
}
|
|
|
|
// detachMount tears down a single mount target with MNT_DETACH (lazy) +
|
|
// UMOUNT_NOFOLLOW (refuse symlinks). Falls back to `sudo umount --lazy`
|
|
// when not running as root, since umount2() requires CAP_SYS_ADMIN.
|
|
//
|
|
// ENOENT and EINVAL on the syscall path are treated as "already gone" —
|
|
// findmnt's snapshot can race with parallel cleanups, and a missing
|
|
// mount is the desired end state.
|
|
func (m *Manager) detachMount(ctx context.Context, target string) error {
|
|
if os.Geteuid() == 0 {
|
|
err := unix.Unmount(target, unix.MNT_DETACH|unix.UMOUNT_NOFOLLOW)
|
|
if err == nil || errors.Is(err, unix.ENOENT) || errors.Is(err, unix.EINVAL) {
|
|
return nil
|
|
}
|
|
return err
|
|
}
|
|
// Local-priv fallback: shell `umount --lazy` resolves the path through
|
|
// the kernel without UMOUNT_NOFOLLOW, but the EvalSymlinks check earlier
|
|
// already constrained the chroot tree. The dev-mode caveat in
|
|
// docs/privileges.md covers this branch's looser guarantees.
|
|
_, err := m.runner.RunSudo(ctx, "umount", "--lazy", target)
|
|
return err
|
|
}
|
|
|
|
func (m *Manager) bindFile(ctx context.Context, source, target string, readOnly bool) error {
|
|
if err := m.sudo(ctx, "install", "-D", "-m", "0644", "/dev/null", target); err != nil {
|
|
return fmt.Errorf("create bind target file: %w", err)
|
|
}
|
|
return m.bindMount(ctx, source, target, readOnly)
|
|
}
|
|
|
|
func (m *Manager) bindDir(ctx context.Context, source, target string, readOnly bool) error {
|
|
if err := m.sudo(ctx, "mkdir", "-p", target); err != nil {
|
|
return fmt.Errorf("create bind target dir: %w", err)
|
|
}
|
|
return m.bindMount(ctx, source, target, readOnly)
|
|
}
|
|
|
|
func (m *Manager) bindMount(ctx context.Context, source, target string, readOnly bool) error {
|
|
if err := m.sudo(ctx, "mount", "--bind", source, target); err != nil {
|
|
return err
|
|
}
|
|
if !readOnly {
|
|
return nil
|
|
}
|
|
// Single-step ro bind isn't honored by all kernels — the bind happens
|
|
// rw and the ro flag is silently ignored. Remount makes it stick.
|
|
return m.sudo(ctx, "mount", "-o", "remount,bind,ro", target)
|
|
}
|
|
|
|
// mountsUnder returns the list of mount targets at or under chrootRoot.
|
|
// findmnt's output is one path per line; an empty list means no leftovers.
|
|
func (m *Manager) mountsUnder(ctx context.Context, chrootRoot string) ([]string, error) {
|
|
out, err := m.runner.Run(ctx, "findmnt", "--output", "TARGET", "--list", "--noheadings")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
var mounts []string
|
|
prefix := chrootRoot + string(os.PathSeparator)
|
|
for _, line := range strings.Split(string(out), "\n") {
|
|
t := strings.TrimSpace(line)
|
|
if t == chrootRoot || strings.HasPrefix(t, prefix) {
|
|
mounts = append(mounts, t)
|
|
}
|
|
}
|
|
return mounts, nil
|
|
}
|
|
|
|
func (m *Manager) stageDrive(ctx context.Context, chrootRoot string, d ChrootDriveSpec, uid, gid int) error {
|
|
info, err := os.Stat(d.HostPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if info.Mode()&os.ModeDevice != 0 {
|
|
stat, ok := info.Sys().(*syscall.Stat_t)
|
|
if !ok {
|
|
return fmt.Errorf("stat %s: cannot read device numbers", d.HostPath)
|
|
}
|
|
major := unix.Major(stat.Rdev)
|
|
minor := unix.Minor(stat.Rdev)
|
|
return m.mknodBlock(ctx, chrootRoot, d.ChrootName, major, minor, uid, gid)
|
|
}
|
|
return m.linkInto(ctx, chrootRoot, d.HostPath, d.ChrootName, uid, gid)
|
|
}
|
|
|
|
func (m *Manager) stageDevice(ctx context.Context, chrootRoot, chrootName, hostDevice string, uid, gid int) error {
|
|
info, err := os.Stat(hostDevice)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
stat, ok := info.Sys().(*syscall.Stat_t)
|
|
if !ok {
|
|
return fmt.Errorf("stat %s: cannot read device numbers", hostDevice)
|
|
}
|
|
major := unix.Major(stat.Rdev)
|
|
minor := unix.Minor(stat.Rdev)
|
|
target := chrootRoot + "/" + chrootName
|
|
if err := m.sudo(ctx, "mknod", "-m", "0660", target, "c", strconv.FormatUint(uint64(major), 10), strconv.FormatUint(uint64(minor), 10)); err != nil {
|
|
return err
|
|
}
|
|
return m.chown(ctx, target, uid, gid)
|
|
}
|
|
|
|
func (m *Manager) mknodBlock(ctx context.Context, chrootRoot, name string, major, minor uint32, uid, gid int) error {
|
|
target := chrootRoot + "/" + name
|
|
if err := m.sudo(ctx, "mknod", "-m", "0660", target, "b", strconv.FormatUint(uint64(major), 10), strconv.FormatUint(uint64(minor), 10)); err != nil {
|
|
return err
|
|
}
|
|
return m.chown(ctx, target, uid, gid)
|
|
}
|
|
|
|
func (m *Manager) linkInto(ctx context.Context, chrootRoot, source, name string, uid, gid int) error {
|
|
target := chrootRoot + "/" + name
|
|
if err := m.sudo(ctx, "ln", "-f", source, target); err != nil {
|
|
return err
|
|
}
|
|
return m.chown(ctx, target, uid, gid)
|
|
}
|
|
|
|
func (m *Manager) chown(ctx context.Context, target string, uid, gid int) error {
|
|
return m.sudo(ctx, "chown", fmt.Sprintf("%d:%d", uid, gid), target)
|
|
}
|
|
|
|
func (m *Manager) sudo(ctx context.Context, name string, args ...string) error {
|
|
if os.Geteuid() == 0 {
|
|
_, err := m.runner.Run(ctx, name, args...)
|
|
return err
|
|
}
|
|
_, err := m.runner.RunSudo(ctx, append([]string{name}, args...)...)
|
|
return err
|
|
}
|
|
|
|
func waitForPath(ctx context.Context, path string, timeout time.Duration, label string) error {
|
|
return pollPath(ctx, path, timeout, 100*time.Millisecond, label)
|
|
}
|
|
|
|
// pollPath waits for path to become stat-able.
//
// It stats path immediately and then once per interval until one of:
//   - the stat succeeds: returns nil;
//   - the stat fails with anything other than "does not exist": returns that
//     error;
//   - timeout has elapsed since the call started: returns an error of the
//     form "<label> not ready: <path>" wrapping context.DeadlineExceeded;
//   - ctx is done: returns ctx.Err().
//
// The final wait is shortened to the time remaining, so the call does not
// overshoot timeout by up to a whole interval.
func pollPath(ctx context.Context, path string, timeout, interval time.Duration, label string) error {
	deadline := time.Now().Add(timeout)
	for {
		_, err := os.Stat(path)
		if err == nil {
			return nil
		}
		if !os.IsNotExist(err) {
			return err
		}

		remaining := time.Until(deadline)
		if remaining <= 0 {
			return fmt.Errorf("%s not ready: %s: %w", label, path, context.DeadlineExceeded)
		}
		wait := interval
		if remaining < wait {
			wait = remaining
		}

		timer := time.NewTimer(wait)
		select {
		case <-ctx.Done():
			// Release the timer right away instead of leaving it pending.
			timer.Stop()
			return ctx.Err()
		case <-timer.C:
		}
	}
}
|