Each VM's firecracker now runs inside a per-VM chroot dropped to the registered owner UID via firecracker-jailer. Closes the broad ambient-sudo escalation surface that survived Phase A: the helper still needs caps for tap/bridge/dm/loop/iptables, but the VMM itself no longer runs as root in the host root filesystem. The host helper stages each chroot up front: hard-links the kernel and (optional) initrd, mknods block-device drives + /dev/vhost-vsock, copies in the firecracker binary (jailer opens it O_RDWR so a ro bind fails with EROFS), and bind-mounts /usr/lib + /lib trees read-only so the dynamic linker can resolve. Self-binds the chroot first so the findmnt-guarded cleanup can recurse safely. AF_UNIX sun_path is 108 bytes; the chroot path easily blows past that. Daemon-side launch pre-symlinks the short request socket path to the long chroot socket before Machine.Start so the SDK's poll/connect sees the short path while the kernel resolves to the chroot socket. --new-pid-ns is intentionally disabled — jailer's PID-namespace fork makes the SDK see the parent exit and tear the API socket down too early. CapabilityBoundingSet for the helper expands to add CAP_FOWNER, CAP_KILL, CAP_MKNOD, CAP_SETGID, CAP_SETUID, CAP_SYS_CHROOT alongside the existing CAP_CHOWN/CAP_DAC_OVERRIDE/CAP_NET_ADMIN/CAP_NET_RAW/CAP_SYS_ADMIN. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
367 lines
11 KiB
Go
367 lines
11 KiB
Go
package firecracker
|
|
|
|
import (
|
|
"context"
|
|
"io"
|
|
"log/slog"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
|
|
sdk "github.com/firecracker-microvm/firecracker-go-sdk"
|
|
models "github.com/firecracker-microvm/firecracker-go-sdk/client/models"
|
|
"github.com/sirupsen/logrus"
|
|
|
|
"banger/internal/vsockagent"
|
|
)
|
|
|
|
// MachineConfig describes one firecracker VMM launch: binary, sockets,
// boot artifacts, devices, sizing, and the optional jailer wrapper.
type MachineConfig struct {
	// BinaryPath is the firecracker executable; when Jailer is set it
	// becomes the jailer's --exec-file (see jailerArgs).
	BinaryPath string
	// VMID is the per-VM identifier passed to firecracker/jailer as --id.
	VMID string
	// SocketPath is the firecracker API socket path.
	SocketPath string
	// LogPath, when non-empty, receives the process's stdout/stderr
	// (see openLogFile / buildProcessRunner).
	LogPath string
	// MetricsPath is forwarded to sdk.Config.MetricsPath.
	MetricsPath string
	KernelImagePath string
	InitrdPath      string
	KernelArgs      string
	// Drives lists the block devices; see splitDrives for root-drive
	// selection.
	Drives []DriveConfig
	// TapDevice is the host tap interface name for the guest NIC.
	TapDevice string
	// VSockPath is the host UDS backing the vsock device; empty
	// disables vsock (see buildVsockDevices).
	VSockPath string
	// VSockCID is the guest vsock context ID; 0 disables vsock.
	VSockCID uint32
	VCPUCount int
	MemoryMiB int
	// Logger receives SDK log output via the logrus→slog bridge.
	Logger *slog.Logger
	// Jailer, when non-nil, wraps firecracker in `jailer`. Path fields
	// (SocketPath, KernelImagePath, InitrdPath, Drives[].Path, VSockPath)
	// MUST be pre-translated by the caller: SocketPath/VSockPath as
	// host-visible chroot paths; the rest as chroot-internal paths
	// (jailer chroots before exec, so firecracker resolves them inside
	// the chroot).
	Jailer *JailerOpts
}
|
|
|
|
// JailerOpts captures the jailer-specific knobs. The chroot tree at
// `<ChrootBaseDir>/firecracker/<VMID>/root/` and the kernel/drive nodes
// inside it must be staged by the caller before NewMachine — this
// package only constructs the launch cmd.
type JailerOpts struct {
	// Binary is the jailer executable path.
	Binary string
	// ChrootBaseDir is the base under which jailer builds its
	// <base>/firecracker/<id>/root tree (see JailerChrootRoot).
	ChrootBaseDir string
	// UID and GID are the unprivileged credentials jailer drops the
	// VMM to (passed as --uid/--gid).
	UID int
	GID int
}
|
|
|
|
// JailerSocketName is the chroot-relative API socket path passed to
// firecracker via --api-sock. Lives at the chroot root (no /run/ subdir
// required) so we don't depend on jailer creating intermediate dirs.
const JailerSocketName = "/firecracker.socket"

// JailerVSockName mirrors JailerSocketName for the vsock UDS.
const JailerVSockName = "/vsock.sock"
|
|
|
|
// DriveConfig describes a single block device attached to the VM.
type DriveConfig struct {
	// ID is the firecracker drive ID; blank falls back to
	// "rootfs"/"drive" (see defaultDriveID).
	ID string
	// Path is the backing file or device node. Under the jailer this
	// must be the chroot-internal path (see MachineConfig.Jailer).
	Path string
	// ReadOnly attaches the drive read-only.
	ReadOnly bool
	// IsRoot marks the root filesystem drive; when several are marked,
	// the last one wins (see splitDrives).
	IsRoot bool
}
|
|
|
|
// Machine pairs the SDK machine with the host-side log file capturing
// the process's stdout/stderr.
type Machine struct {
	machine *sdk.Machine
	// logFile may be nil (no LogPath configured); closed exactly once
	// via closeOnce when the VMM exits or fails to start.
	logFile   *os.File
	closeOnce sync.Once
}
|
|
|
|
// Client is a thin wrapper over the firecracker SDK's API-socket client.
type Client struct {
	client *sdk.Client
}
|
|
|
|
func NewMachine(ctx context.Context, cfg MachineConfig) (*Machine, error) {
|
|
logFile, err := openLogFile(cfg.LogPath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
cmd := buildProcessRunner(cfg, logFile)
|
|
machine, err := sdk.NewMachine(
|
|
ctx,
|
|
buildConfig(cfg),
|
|
sdk.WithProcessRunner(cmd),
|
|
sdk.WithLogger(newLogger(cfg.Logger)),
|
|
)
|
|
if err != nil {
|
|
if logFile != nil {
|
|
_ = logFile.Close()
|
|
}
|
|
return nil, err
|
|
}
|
|
|
|
return &Machine{machine: machine, logFile: logFile}, nil
|
|
}
|
|
|
|
// JailerChrootRoot returns the host-visible path to the jailer chroot
// root for vmid under base. It mirrors the layout firecracker's jailer
// builds: <base>/firecracker/<vmid>/root.
func JailerChrootRoot(base, vmid string) string {
	perVM := filepath.Join(base, "firecracker", vmid)
	return filepath.Join(perVM, "root")
}
|
|
|
|
func (m *Machine) Start(ctx context.Context) error {
|
|
// The caller's ctx is INTENTIONALLY not forwarded to the SDK.
|
|
// firecracker-go-sdk's startVMM (machine.go) spawns a goroutine
|
|
// that SIGTERMs firecracker the instant this ctx cancels, and
|
|
// retains it for the lifetime of the VMM — not just the boot
|
|
// phase. Plumbing an RPC request ctx through would mean
|
|
// firecracker dies the moment the daemon writes its RPC response
|
|
// (daemon.go:handleConn defers cancel). That silently breaks
|
|
// `vm start` on a stopped VM: start "succeeds", the handler
|
|
// returns, ctx cancels, firecracker is SIGTERMed, and the next
|
|
// `vm ssh` hits `vmAlive = false`. `vm.create` sidesteps the bug
|
|
// because BeginVMCreate detaches to a background ctx before
|
|
// calling startVMLocked.
|
|
//
|
|
// We own firecracker lifecycle explicitly — StopVM / KillVM /
|
|
// cleanupRuntime — so losing ctx-driven cancellation here is
|
|
// deliberate. The SDK still enforces its own boot-phase timeouts
|
|
// (socket wait, HTTP) with internal deadlines.
|
|
if err := m.machine.Start(context.Background()); err != nil {
|
|
m.closeLog()
|
|
return err
|
|
}
|
|
_ = ctx
|
|
|
|
go func() {
|
|
_ = m.machine.Wait(context.Background())
|
|
m.closeLog()
|
|
}()
|
|
|
|
return nil
|
|
}
|
|
|
|
// PID reports the VMM process ID as tracked by the SDK machine.
func (m *Machine) PID() (int, error) {
	return m.machine.PID()
}
|
|
|
|
// New builds a Client speaking to the firecracker API socket at apiSock,
// bridging SDK logging onto logger. The trailing false is presumably the
// SDK client's debug flag — confirm against sdk.NewClient's signature.
func New(apiSock string, logger *slog.Logger) *Client {
	return &Client{client: sdk.NewClient(apiSock, newLogger(logger), false)}
}
|
|
|
|
func (c *Client) SendCtrlAltDel(ctx context.Context) error {
|
|
action := models.InstanceActionInfoActionTypeSendCtrlAltDel
|
|
_, err := c.client.CreateSyncAction(ctx, &models.InstanceActionInfo{
|
|
ActionType: &action,
|
|
})
|
|
return err
|
|
}
|
|
|
|
// openLogFile opens path for append-only writing, creating it with mode
// 0644 if absent. An empty path yields (nil, nil): logging is optional
// and callers nil-check the handle.
func openLogFile(path string) (*os.File, error) {
	if path == "" {
		return nil, nil
	}
	const flags = os.O_CREATE | os.O_WRONLY | os.O_APPEND
	return os.OpenFile(path, flags, 0o644)
}
|
|
|
|
// buildConfig translates MachineConfig into the SDK's sdk.Config:
// drives, network, vsock, machine sizing, and the jailer-specific
// overrides (validation skip, log-path suppression).
func buildConfig(cfg MachineConfig) sdk.Config {
	rootDrive, extraDrives := splitDrives(cfg.Drives)
	// NewDrivesBuilder seeds the root drive; WithRootDrive re-registers
	// it so the drive ID and read-only flag can be set explicitly.
	drivesBuilder := sdk.NewDrivesBuilder(rootDrive.Path).
		WithRootDrive(rootDrive.Path, sdk.WithDriveID(defaultDriveID(rootDrive, "rootfs")), sdk.WithReadOnly(rootDrive.ReadOnly))
	for _, drive := range extraDrives {
		// splitDrives already drops blank paths; this guard is defensive.
		if strings.TrimSpace(drive.Path) == "" {
			continue
		}
		drivesBuilder = drivesBuilder.AddDrive(drive.Path, drive.ReadOnly, sdk.WithDriveID(defaultDriveID(drive, "drive")))
	}
	drives := drivesBuilder.Build()

	out := sdk.Config{
		SocketPath:      cfg.SocketPath,
		LogPath:         cfg.LogPath,
		MetricsPath:     cfg.MetricsPath,
		KernelImagePath: cfg.KernelImagePath,
		InitrdPath:      cfg.InitrdPath,
		KernelArgs:      cfg.KernelArgs,
		Drives:          drives,
		NetworkInterfaces: sdk.NetworkInterfaces{{
			StaticConfiguration: &sdk.StaticNetworkConfiguration{
				HostDevName: cfg.TapDevice,
			},
		}},
		VsockDevices: buildVsockDevices(cfg),
		MachineCfg: models.MachineConfiguration{
			VcpuCount:  sdk.Int64(int64(cfg.VCPUCount)),
			MemSizeMib: sdk.Int64(int64(cfg.MemoryMiB)),
			// Hyperthreading off for the guest.
			Smt: sdk.Bool(false),
		},
		VMID: cfg.VMID,
	}
	if cfg.Jailer != nil {
		// The path fields above are already chroot-translated by the
		// caller (see MachineConfig.Jailer doc). Skip the SDK's host-side
		// existence checks — kernel/drives live inside the chroot, not
		// at the paths we report.
		out.DisableValidation = true
		// LogPath is the host-side file used only for cmd.Stderr capture.
		// Clearing it here prevents the SDK from sending PUT /logger with
		// a host path that firecracker can't open from inside the chroot.
		out.LogPath = ""
	}
	return out
}
|
|
|
|
func buildVsockDevices(cfg MachineConfig) []sdk.VsockDevice {
|
|
if strings.TrimSpace(cfg.VSockPath) == "" || cfg.VSockCID == 0 {
|
|
return nil
|
|
}
|
|
return []sdk.VsockDevice{{
|
|
ID: "vsock",
|
|
Path: cfg.VSockPath,
|
|
CID: cfg.VSockCID,
|
|
}}
|
|
}
|
|
|
|
func splitDrives(drives []DriveConfig) (DriveConfig, []DriveConfig) {
|
|
root := DriveConfig{ID: "rootfs"}
|
|
var extras []DriveConfig
|
|
for _, drive := range drives {
|
|
if strings.TrimSpace(drive.Path) == "" {
|
|
continue
|
|
}
|
|
if drive.IsRoot {
|
|
root = drive
|
|
if root.ID == "" {
|
|
root.ID = "rootfs"
|
|
}
|
|
continue
|
|
}
|
|
extras = append(extras, drive)
|
|
}
|
|
return root, extras
|
|
}
|
|
|
|
func defaultDriveID(drive DriveConfig, fallback string) string {
|
|
if strings.TrimSpace(drive.ID) != "" {
|
|
return drive.ID
|
|
}
|
|
return fallback
|
|
}
|
|
|
|
// buildProcessRunner constructs the *exec.Cmd the SDK will start. Args are
|
|
// passed directly — no shell, no string interpolation — so any future change
|
|
// to MachineConfig fields can't smuggle shell metacharacters into the launch.
|
|
//
|
|
// The daemon and root-helper processes set umask 077 at startup, so the
|
|
// API/vsock sockets firecracker creates inherit 0600 mode without needing a
|
|
// shell-level `umask` wrapper.
|
|
//
|
|
// When firecracker has to be launched under sudo (non-root daemon), the
|
|
// resulting sockets are root-owned. The caller (LaunchFirecracker) kicks off
|
|
// fcproc.EnsureSocketAccessForAsync immediately *before* Machine.Start so the
|
|
// chown wins the race against the SDK's HTTP probe over the API socket. That
|
|
// replaces the previous in-shell chown_watcher.
|
|
//
|
|
// When cfg.Jailer is set, the launch is wrapped by `jailer`. The chroot tree
|
|
// MUST already be staged (kernel hard-linked, drives mknod'd, dirs chowned to
|
|
// the configured UID:GID) — see fcproc.PrepareJailerChroot. The SDK's own
|
|
// JailerCfg path is intentionally bypassed: it cannot mknod block devices and
|
|
// does not expose --new-pid-ns.
|
|
func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd {
|
|
var bin string
|
|
var args []string
|
|
if cfg.Jailer != nil {
|
|
bin, args = jailerArgs(cfg)
|
|
} else {
|
|
bin = cfg.BinaryPath
|
|
args = []string{"--api-sock", cfg.SocketPath, "--id", cfg.VMID}
|
|
}
|
|
var cmd *exec.Cmd
|
|
if os.Geteuid() == 0 {
|
|
cmd = exec.Command(bin, args...)
|
|
} else {
|
|
cmd = exec.Command("sudo", append([]string{"-n", "-E", bin}, args...)...)
|
|
}
|
|
cmd.Stdin = nil
|
|
if logFile != nil {
|
|
cmd.Stdout = logFile
|
|
cmd.Stderr = logFile
|
|
}
|
|
return cmd
|
|
}
|
|
|
|
// jailerArgs returns the (binary, args) tuple for the jailer wrapper.
|
|
// firecracker's flags are passed after `--`. --new-pid-ns gives the guest
|
|
// VMM its own PID namespace; the SDK's JailerCommandBuilder doesn't expose
|
|
// it in v1.0.0, which is the main reason this path doesn't go through
|
|
// sdk.Config.JailerCfg.
|
|
func jailerArgs(cfg MachineConfig) (string, []string) {
|
|
args := []string{
|
|
"--id", cfg.VMID,
|
|
"--uid", strconv.Itoa(cfg.Jailer.UID),
|
|
"--gid", strconv.Itoa(cfg.Jailer.GID),
|
|
"--exec-file", cfg.BinaryPath,
|
|
"--chroot-base-dir", cfg.Jailer.ChrootBaseDir,
|
|
// "--new-pid-ns": jailer forks when creating the PID namespace; the
|
|
// SDK tracks the parent's PID, which exits immediately, causing the
|
|
// SDK's "process exited" goroutine to tear down the API socket while
|
|
// firecracker is still booting in the child. Left out intentionally.
|
|
"--",
|
|
"--api-sock", JailerSocketName,
|
|
}
|
|
return cfg.Jailer.Binary, args
|
|
}
|
|
|
|
func newLogger(base *slog.Logger) *logrus.Entry {
|
|
logger := logrus.New()
|
|
logger.SetOutput(io.Discard)
|
|
logger.SetLevel(logrus.DebugLevel)
|
|
logger.AddHook(slogHook{logger: base})
|
|
return logrus.NewEntry(logger)
|
|
}
|
|
|
|
// HealthVSock probes the guest agent over the vsock UDS at socketPath,
// delegating to vsockagent.Health.
func HealthVSock(ctx context.Context, logger *slog.Logger, socketPath string) error {
	return vsockagent.Health(ctx, logger, socketPath)
}

// PingVSock is an alias for HealthVSock, kept for existing callers.
func PingVSock(ctx context.Context, logger *slog.Logger, socketPath string) error {
	return HealthVSock(ctx, logger, socketPath)
}
|
|
|
|
// slogHook is a logrus hook forwarding SDK log entries to an
// *slog.Logger. A nil logger silently drops entries (see Fire).
type slogHook struct {
	logger *slog.Logger
}

// Levels registers the hook for every logrus level; filtering is left
// to the slog handler downstream.
func (h slogHook) Levels() []logrus.Level {
	return logrus.AllLevels
}
|
|
|
|
func (h slogHook) Fire(entry *logrus.Entry) error {
|
|
if h.logger == nil {
|
|
return nil
|
|
}
|
|
level := slog.LevelDebug
|
|
switch entry.Level {
|
|
case logrus.PanicLevel, logrus.FatalLevel, logrus.ErrorLevel:
|
|
level = slog.LevelError
|
|
case logrus.WarnLevel:
|
|
level = slog.LevelWarn
|
|
default:
|
|
level = slog.LevelDebug
|
|
}
|
|
attrs := make([]any, 0, len(entry.Data)*2+2)
|
|
attrs = append(attrs, "component", "firecracker_sdk")
|
|
for key, value := range entry.Data {
|
|
attrs = append(attrs, key, value)
|
|
}
|
|
h.logger.Log(context.Background(), level, entry.Message, attrs...)
|
|
return nil
|
|
}
|
|
|
|
func (m *Machine) closeLog() {
|
|
m.closeOnce.Do(func() {
|
|
if m.logFile != nil {
|
|
_ = m.logFile.Close()
|
|
}
|
|
})
|
|
}
|