firecracker: adopt firecracker-jailer for VM launch (Phase B)
Each VM's firecracker now runs inside a per-VM chroot dropped to the registered owner UID via firecracker-jailer. Closes the broad ambient-sudo escalation surface that survived Phase A: the helper still needs caps for tap/bridge/dm/loop/iptables, but the VMM itself no longer runs as root in the host root filesystem. The host helper stages each chroot up front: hard-links the kernel and (optional) initrd, mknods block-device drives + /dev/vhost-vsock, copies in the firecracker binary (jailer opens it O_RDWR so a ro bind fails with EROFS), and bind-mounts /usr/lib + /lib trees read-only so the dynamic linker can resolve. Self-binds the chroot first so the findmnt-guarded cleanup can recurse safely. AF_UNIX sun_path is 108 bytes; the chroot path easily blows past that. Daemon-side launch pre-symlinks the short request socket path to the long chroot socket before Machine.Start so the SDK's poll/connect sees the short path while the kernel resolves to the chroot socket. --new-pid-ns is intentionally disabled — jailer's PID-namespace fork makes the SDK see the parent exit and tear the API socket down too early. CapabilityBoundingSet for the helper expands to add CAP_FOWNER, CAP_KILL, CAP_MKNOD, CAP_SETGID, CAP_SETUID, CAP_SYS_CHROOT alongside the existing CAP_CHOWN/CAP_DAC_OVERRIDE/CAP_NET_ADMIN/CAP_NET_RAW/CAP_SYS_ADMIN. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d73efe6fbc
commit
6b543cb17f
12 changed files with 864 additions and 56 deletions
|
|
@ -6,6 +6,8 @@ import (
|
|||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
|
|
@ -32,8 +34,34 @@ type MachineConfig struct {
|
|||
VCPUCount int
|
||||
MemoryMiB int
|
||||
Logger *slog.Logger
|
||||
// Jailer, when non-nil, wraps firecracker in `jailer`. Path fields
|
||||
// (SocketPath, KernelImagePath, InitrdPath, Drives[].Path, VSockPath)
|
||||
// MUST be pre-translated by the caller: SocketPath/VSockPath as
|
||||
// host-visible chroot paths; the rest as chroot-internal paths
|
||||
// (jailer chroots before exec, so firecracker resolves them inside
|
||||
// the chroot).
|
||||
Jailer *JailerOpts
|
||||
}
|
||||
|
||||
// JailerOpts captures the jailer-specific knobs. The chroot tree at
|
||||
// `<ChrootBaseDir>/firecracker/<VMID>/root/` and the kernel/drive nodes
|
||||
// inside it must be staged by the caller before NewMachine — this
|
||||
// package only constructs the launch cmd.
|
||||
type JailerOpts struct {
|
||||
// Binary is the jailer executable; when Jailer is set it is the command
// that gets launched instead of the firecracker binary.
Binary string
|
||||
// ChrootBaseDir is forwarded to jailer as --chroot-base-dir.
ChrootBaseDir string
|
||||
// UID is forwarded to jailer as --uid.
UID int
|
||||
// GID is forwarded to jailer as --gid.
GID int
|
||||
}
|
||||
|
||||
// JailerSocketName is the chroot-relative API socket path passed to
|
||||
// firecracker via --api-sock. Lives at the chroot root (no /run/ subdir
|
||||
// required) so we don't depend on jailer creating intermediate dirs.
// The leading slash makes it absolute *inside* the chroot; it is not a
// host path on its own.
|
||||
const JailerSocketName = "/firecracker.socket"
|
||||
|
||||
// JailerVSockName mirrors JailerSocketName for the vsock UDS.
// NOTE(review): presumably also chroot-relative and handed to firecracker
// as the vsock UDS path — confirm against the caller, it is not used here.
|
||||
const JailerVSockName = "/vsock.sock"
|
||||
|
||||
type DriveConfig struct {
|
||||
ID string
|
||||
Path string
|
||||
|
|
@ -74,6 +102,13 @@ func NewMachine(ctx context.Context, cfg MachineConfig) (*Machine, error) {
|
|||
return &Machine{machine: machine, logFile: logFile}, nil
|
||||
}
|
||||
|
||||
// JailerChrootRoot returns the host-visible path to the jailer chroot
|
||||
// root for vmid under base. Mirrors the layout firecracker's jailer
|
||||
// builds: <base>/firecracker/<vmid>/root.
//
// The result is a pure path computation; nothing is created or checked on
// disk.
//
// NOTE(review): the middle component is hard-coded to "firecracker". Jailer
// is understood to derive that component from the basename of --exec-file,
// so this only matches when the firecracker binary is literally named
// "firecracker" — confirm against the jailer documentation and callers.
|
||||
func JailerChrootRoot(base, vmid string) string {
|
||||
// filepath.Join also cleans the result, so a trailing slash on base is harmless.
return filepath.Join(base, "firecracker", vmid, "root")
|
||||
}
|
||||
|
||||
func (m *Machine) Start(ctx context.Context) error {
|
||||
// The caller's ctx is INTENTIONALLY not forwarded to the SDK.
|
||||
// firecracker-go-sdk's startVMM (machine.go) spawns a goroutine
|
||||
|
|
@ -141,7 +176,7 @@ func buildConfig(cfg MachineConfig) sdk.Config {
|
|||
}
|
||||
drives := drivesBuilder.Build()
|
||||
|
||||
return sdk.Config{
|
||||
out := sdk.Config{
|
||||
SocketPath: cfg.SocketPath,
|
||||
LogPath: cfg.LogPath,
|
||||
MetricsPath: cfg.MetricsPath,
|
||||
|
|
@ -162,6 +197,18 @@ func buildConfig(cfg MachineConfig) sdk.Config {
|
|||
},
|
||||
VMID: cfg.VMID,
|
||||
}
|
||||
if cfg.Jailer != nil {
|
||||
// The path fields above are already chroot-translated by the
|
||||
// caller (see MachineConfig.Jailer doc). Skip the SDK's host-side
|
||||
// existence checks — kernel/drives live inside the chroot, not
|
||||
// at the paths we report.
|
||||
out.DisableValidation = true
|
||||
// LogPath is the host-side file used only for cmd.Stderr capture.
|
||||
// Clearing it here prevents the SDK from sending PUT /logger with
|
||||
// a host path that firecracker can't open from inside the chroot.
|
||||
out.LogPath = ""
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func buildVsockDevices(cfg MachineConfig) []sdk.VsockDevice {
|
||||
|
|
@ -214,13 +261,26 @@ func defaultDriveID(drive DriveConfig, fallback string) string {
|
|||
// fcproc.EnsureSocketAccessForAsync immediately *before* Machine.Start so the
|
||||
// chown wins the race against the SDK's HTTP probe over the API socket. That
|
||||
// replaces the previous in-shell chown_watcher.
|
||||
//
|
||||
// When cfg.Jailer is set, the launch is wrapped by `jailer`. The chroot tree
|
||||
// MUST already be staged (kernel hard-linked, drives mknod'd, dirs chowned to
|
||||
// the configured UID:GID) — see fcproc.PrepareJailerChroot. The SDK's own
|
||||
// JailerCfg path is intentionally bypassed: it cannot mknod block devices and
|
||||
// does not expose --new-pid-ns.
|
||||
func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd {
|
||||
args := []string{"--api-sock", cfg.SocketPath, "--id", cfg.VMID}
|
||||
var bin string
|
||||
var args []string
|
||||
if cfg.Jailer != nil {
|
||||
bin, args = jailerArgs(cfg)
|
||||
} else {
|
||||
bin = cfg.BinaryPath
|
||||
args = []string{"--api-sock", cfg.SocketPath, "--id", cfg.VMID}
|
||||
}
|
||||
var cmd *exec.Cmd
|
||||
if os.Geteuid() == 0 {
|
||||
cmd = exec.Command(cfg.BinaryPath, args...)
|
||||
cmd = exec.Command(bin, args...)
|
||||
} else {
|
||||
cmd = exec.Command("sudo", append([]string{"-n", "-E", cfg.BinaryPath}, args...)...)
|
||||
cmd = exec.Command("sudo", append([]string{"-n", "-E", bin}, args...)...)
|
||||
}
|
||||
cmd.Stdin = nil
|
||||
if logFile != nil {
|
||||
|
|
@ -230,6 +290,28 @@ func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd {
|
|||
return cmd
|
||||
}
|
||||
|
||||
// jailerArgs returns the (binary, args) tuple for the jailer wrapper.
|
||||
// Jailer's own flags come first; firecracker's flags are passed after `--`.
|
||||
// --new-pid-ns is deliberately NOT among them: the comment inside the
|
||||
// argument list explains why. (The SDK's JailerCommandBuilder does not
// expose that flag in v1.0.0 either.)
|
||||
// cfg.Jailer must be non-nil; it is dereferenced here without a check.
|
||||
func jailerArgs(cfg MachineConfig) (string, []string) {
|
||||
args := []string{
|
||||
"--id", cfg.VMID,
|
||||
"--uid", strconv.Itoa(cfg.Jailer.UID),
|
||||
"--gid", strconv.Itoa(cfg.Jailer.GID),
|
||||
// The firecracker binary that jailer runs on our behalf.
"--exec-file", cfg.BinaryPath,
|
||||
"--chroot-base-dir", cfg.Jailer.ChrootBaseDir,
|
||||
// "--new-pid-ns": jailer forks when creating the PID namespace; the
|
||||
// SDK tracks the parent's PID, which exits immediately, causing the
|
||||
// SDK's "process exited" goroutine to tear down the API socket while
|
||||
// firecracker is still booting in the child. Left out intentionally.
|
||||
// Everything after "--" is handed to firecracker rather than parsed by jailer.
"--",
|
||||
// Chroot-relative socket path, not cfg.SocketPath: firecracker resolves
// it from inside the chroot.
"--api-sock", JailerSocketName,
|
||||
}
|
||||
return cfg.Jailer.Binary, args
|
||||
}
|
||||
|
||||
func newLogger(base *slog.Logger) *logrus.Entry {
|
||||
logger := logrus.New()
|
||||
logger.SetOutput(io.Discard)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue