roothelper: tighten input validation across privileged RPCs
Defence-in-depth pass over every helper method that touches the host
as root. Each fix narrows what a compromised owner-uid daemon could
ask the helper to do; many close concrete file-ownership and DoS
primitives that the previous validators didn't reach.
Path / identifier validation:
* priv.fsck_snapshot now requires /dev/mapper/fc-rootfs-* (was
"is the string non-empty"). e2fsck -fy on /dev/sda1 was the
motivating exploit.
* priv.kill_process and priv.signal_process now read
/proc/<pid>/cmdline and require a "firecracker" substring before
sending the signal. Killing arbitrary host PIDs (sshd, init, …)
is no longer a one-RPC primitive.
* priv.read_ext4_file and priv.write_ext4_files now require the
image path to live under StateDir or be /dev/mapper/fc-rootfs-*.
* priv.cleanup_dm_snapshot validates every non-empty Handles field:
DM name fc-rootfs-*, DM device /dev/mapper/fc-rootfs-*, loops
/dev/loopN.
* priv.remove_dm_snapshot accepts only fc-rootfs-* names or
/dev/mapper/fc-rootfs-* paths.
* priv.ensure_nat now requires a parsable IPv4 address and a
banger-prefixed tap.
* priv.sync_resolver_routing and priv.clear_resolver_routing now
require a Linux iface-name-shaped bridge name (1–15 chars, no
whitespace/'/'/':') and, for sync, a parsable resolver address.
Symlink defence:
* priv.ensure_socket_access now validates the socket path is under
RuntimeDir and not a symlink. The fcproc layer's chown/chmod
moves to unix.Open(O_PATH|O_NOFOLLOW) + Fchownat(AT_EMPTY_PATH)
+ Fchmodat via /proc/self/fd, so even a swap of the leaf into a
symlink between validation and the syscall is refused. The
local-priv (non-root) fallback uses `chown -h`.
* priv.cleanup_jailer_chroot rejects symlinks at both the leaf
(os.Lstat) and intermediate path components (filepath.EvalSymlinks
+ clean-equality). The umount sweep was rewritten from shell
`umount --recursive --lazy` to direct unix.Unmount(MNT_DETACH |
UMOUNT_NOFOLLOW) per child mount, deepest-first; the findmnt
guard remains as the rm-rf safety net. Local-priv mode falls
back to `sudo umount --lazy`.
Binary validation:
* validateRootExecutable now opens with O_PATH|O_NOFOLLOW and
Fstats through the resulting fd. Rejects path-level symlinks and
narrows the TOCTOU window between validation and the SDK's exec
to fork+exec time on a healthy host.
Daemon socket:
* The owner daemon now reads SO_PEERCRED on every accepted
connection and refuses any UID that isn't 0 or the registered
owner. Filesystem perms (0600 + ownerUID) already enforced this;
the check is belt-and-braces in case the socket FD is ever
leaked to a non-owner process.
Docs:
* docs/privileges.md walked end-to-end. Each helper RPC's
Validation gate row reflects what the code actually enforces.
New section "Running outside the system install" calls out the
looser dev-mode trust model (NOPASSWD sudoers, helper hardening
bypassed) so users don't deploy that path on shared hosts.
Trust list updated to include every new validator.
Tests added: validators (DM-loop, DM-remove-target, DM-handles,
ext4-image-path, iface-name, IPv4, resolver-addr, not-symlink,
firecracker-PID, root-executable variants), the daemon's authorize
path (non-unix conn rejection + unix conn happy path), the umount2
ordering contract (deepest-first + --lazy on the sudo branch), and
positive/negative cases for the chown-no-follow fallback.
Verified end-to-end via `make smoke JOBS=4` on a KVM host.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
6b543cb17f
commit
853249dec2
8 changed files with 1177 additions and 63 deletions
|
|
@ -12,6 +12,7 @@ import (
|
|||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
|
|
@ -202,18 +203,57 @@ func (m *Manager) ensureSocketAccessFor(ctx context.Context, socketPath, label s
|
|||
if err := pollPath(ctx, socketPath, timeout, interval, label); err != nil {
|
||||
return err
|
||||
}
|
||||
if os.Geteuid() == 0 {
|
||||
if _, err := m.runner.Run(ctx, "chmod", "600", socketPath); err != nil {
|
||||
return chownChmodNoFollow(ctx, m.runner, socketPath, uid, gid, 0o600)
|
||||
}
|
||||
|
||||
// chownChmodNoFollow sets owner/group/mode on path without following
|
||||
// symlinks at the leaf. Required because the helper RPCs that drive
|
||||
// socket access run as root: a follow-symlink chmod/chown becomes an
|
||||
// arbitrary file-ownership primitive if the caller can plant a symlink
|
||||
// at the target.
|
||||
//
|
||||
// Linux idiom: open with O_PATH|O_NOFOLLOW (errors out if the leaf is a
|
||||
// symlink), Fstat the fd to confirm the file is a unix socket, then
|
||||
// chown via Fchownat(AT_EMPTY_PATH) and chmod via /proc/self/fd/N
|
||||
// (fchmod on an O_PATH fd returns EBADF, but the /proc path resolves
|
||||
// straight back to the inode the fd already pins, so no leaf re-traversal
|
||||
// happens).
|
||||
//
|
||||
// Falls back to `sudo chown -h` + `sudo chmod` for the local-priv mode
|
||||
// where the daemon isn't root and can't issue the syscalls itself; the
|
||||
// `-h` flag still avoids the symlink-follow on the chown side.
|
||||
func chownChmodNoFollow(ctx context.Context, runner Runner, path string, uid, gid int, mode os.FileMode) error {
|
||||
if os.Geteuid() != 0 {
|
||||
// Mode-then-owner ordering preserves the pre-existing failure
|
||||
// semantics of the legacy `chmod 600 / chown` shell-out path
|
||||
// (chmod-failure tests expect chown to be skipped). `chown -h`
|
||||
// keeps the symlink-no-follow guarantee on this branch.
|
||||
if _, err := runner.RunSudo(ctx, "chmod", fmt.Sprintf("%o", mode.Perm()), path); err != nil {
|
||||
return err
|
||||
}
|
||||
_, err := m.runner.Run(ctx, "chown", fmt.Sprintf("%d:%d", uid, gid), socketPath)
|
||||
_, err := runner.RunSudo(ctx, "chown", "-h", fmt.Sprintf("%d:%d", uid, gid), path)
|
||||
return err
|
||||
}
|
||||
if _, err := m.runner.RunSudo(ctx, "chmod", "600", socketPath); err != nil {
|
||||
return err
|
||||
fd, err := unix.Open(path, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
|
||||
if err != nil {
|
||||
return fmt.Errorf("open %s: %w", path, err)
|
||||
}
|
||||
_, err := m.runner.RunSudo(ctx, "chown", fmt.Sprintf("%d:%d", uid, gid), socketPath)
|
||||
return err
|
||||
defer unix.Close(fd)
|
||||
var st unix.Stat_t
|
||||
if err := unix.Fstat(fd, &st); err != nil {
|
||||
return fmt.Errorf("fstat %s: %w", path, err)
|
||||
}
|
||||
if st.Mode&unix.S_IFMT != unix.S_IFSOCK {
|
||||
return fmt.Errorf("%s is not a unix socket (mode %#o)", path, st.Mode&unix.S_IFMT)
|
||||
}
|
||||
procPath := "/proc/self/fd/" + strconv.Itoa(fd)
|
||||
if err := unix.Fchmodat(unix.AT_FDCWD, procPath, uint32(mode.Perm()), 0); err != nil {
|
||||
return fmt.Errorf("chmod %s: %w", path, err)
|
||||
}
|
||||
if err := unix.Fchownat(fd, "", uid, gid, unix.AT_EMPTY_PATH); err != nil {
|
||||
return fmt.Errorf("chown %s: %w", path, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// FindPID returns the PID of the firecracker process listening on apiSock,
|
||||
|
|
@ -447,23 +487,84 @@ func (m *Manager) CleanupJailerChroot(ctx context.Context, chrootRoot string) er
|
|||
if strings.TrimSpace(chrootRoot) == "" {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat(chrootRoot); os.IsNotExist(err) {
|
||||
return nil
|
||||
// Lstat (not Stat): if chrootRoot is a symlink the umount/rm shell-outs
|
||||
// below would chase it. The handler-side validateNotSymlink also catches
|
||||
// this, but lifting the check inside fcproc closes the TOCTOU window
|
||||
// between the handler check and our umount command.
|
||||
info, err := os.Lstat(chrootRoot)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("inspect chroot %s: %w", chrootRoot, err)
|
||||
}
|
||||
// Best-effort umount: for chroots that were never bind-mounted (a
|
||||
// stale install pre-bind-mount work, say) this fails — that's fine,
|
||||
// the findmnt guard below is what enforces safety.
|
||||
_ = m.sudoIgnore(ctx, "umount", "--recursive", "--lazy", chrootRoot)
|
||||
if mounts, err := m.mountsUnder(ctx, chrootRoot); err != nil {
|
||||
if info.Mode()&os.ModeSymlink != 0 {
|
||||
return fmt.Errorf("refusing to clean up %q: path is a symlink", chrootRoot)
|
||||
}
|
||||
if !info.IsDir() {
|
||||
return fmt.Errorf("refusing to clean up %q: not a directory", chrootRoot)
|
||||
}
|
||||
// Resolve any intermediate symlinks and require the result equals the
|
||||
// input — that catches a planted `…/jail/firecracker/<vmid> → /` even
|
||||
// though the leaf "/root" component is itself a real directory inside
|
||||
// the redirected target. Equality + Lstat together cover both top and
|
||||
// intermediate symlink shapes.
|
||||
resolved, err := filepath.EvalSymlinks(chrootRoot)
|
||||
if err != nil {
|
||||
return fmt.Errorf("resolve chroot %s: %w", chrootRoot, err)
|
||||
}
|
||||
if filepath.Clean(resolved) != filepath.Clean(chrootRoot) {
|
||||
return fmt.Errorf("refusing to clean up %q: resolves to %q via symlink", chrootRoot, resolved)
|
||||
}
|
||||
// Switch from `umount --recursive --lazy <chrootRoot>` (shell-resolved,
|
||||
// follows symlinks at exec time) to direct umount2() syscalls per child
|
||||
// mount with UMOUNT_NOFOLLOW. That fully closes the residual TOCTOU
|
||||
// between the EvalSymlinks check above and the unmount: even if a daemon-
|
||||
// uid attacker swapped a child mount's path to a symlink in the gap, the
|
||||
// kernel refuses to follow it. The findmnt guard below still catches any
|
||||
// mount we couldn't detach.
|
||||
mounts, err := m.mountsUnder(ctx, chrootRoot)
|
||||
if err != nil {
|
||||
return fmt.Errorf("inspect chroot mounts: %w", err)
|
||||
} else if len(mounts) > 0 {
|
||||
return fmt.Errorf("refusing to rm -rf %q: still has %d mount(s): %v", chrootRoot, len(mounts), mounts)
|
||||
}
|
||||
// Deepest-first so child mounts come off before parents; otherwise a
|
||||
// parent unmount would EBUSY against in-use children.
|
||||
sort.Slice(mounts, func(i, j int) bool {
|
||||
return strings.Count(mounts[i], "/") > strings.Count(mounts[j], "/")
|
||||
})
|
||||
for _, mt := range mounts {
|
||||
if err := m.detachMount(ctx, mt); err != nil {
|
||||
return fmt.Errorf("detach %q: %w", mt, err)
|
||||
}
|
||||
}
|
||||
if remaining, err := m.mountsUnder(ctx, chrootRoot); err != nil {
|
||||
return fmt.Errorf("re-inspect chroot mounts: %w", err)
|
||||
} else if len(remaining) > 0 {
|
||||
return fmt.Errorf("refusing to rm -rf %q: still has %d mount(s): %v", chrootRoot, len(remaining), remaining)
|
||||
}
|
||||
return m.sudo(ctx, "rm", "-rf", "--", chrootRoot)
|
||||
}
|
||||
|
||||
func (m *Manager) sudoIgnore(ctx context.Context, name string, args ...string) error {
|
||||
err := m.sudo(ctx, name, args...)
|
||||
// detachMount tears down a single mount target with MNT_DETACH (lazy) +
|
||||
// UMOUNT_NOFOLLOW (refuse symlinks). Falls back to `sudo umount --lazy`
|
||||
// when not running as root, since umount2() requires CAP_SYS_ADMIN.
|
||||
//
|
||||
// ENOENT and EINVAL on the syscall path are treated as "already gone" —
|
||||
// findmnt's snapshot can race with parallel cleanups, and a missing
|
||||
// mount is the desired end state.
|
||||
func (m *Manager) detachMount(ctx context.Context, target string) error {
|
||||
if os.Geteuid() == 0 {
|
||||
err := unix.Unmount(target, unix.MNT_DETACH|unix.UMOUNT_NOFOLLOW)
|
||||
if err == nil || errors.Is(err, unix.ENOENT) || errors.Is(err, unix.EINVAL) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
// Local-priv fallback: shell `umount --lazy` resolves the path through
|
||||
// the kernel without UMOUNT_NOFOLLOW, but the EvalSymlinks check earlier
|
||||
// already constrained the chroot tree. The dev-mode caveat in
|
||||
// docs/privileges.md covers this branch's looser guarantees.
|
||||
_, err := m.runner.RunSudo(ctx, "umount", "--lazy", target)
|
||||
return err
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue