banger/internal/daemon/privileged_ops.go
Thales Maciel 6b543cb17f
firecracker: adopt firecracker-jailer for VM launch (Phase B)
Each VM's firecracker now runs inside a per-VM chroot dropped to the
registered owner UID via firecracker-jailer. Closes the broad ambient-
sudo escalation surface that survived Phase A: the helper still needs
caps for tap/bridge/dm/loop/iptables, but the VMM itself no longer
runs as root in the host root filesystem.

The host helper stages each chroot up front: hard-links the kernel
and (optional) initrd, mknods block-device drives + /dev/vhost-vsock,
copies in the firecracker binary (jailer opens it O_RDWR so a ro bind
fails with EROFS), and bind-mounts /usr/lib + /lib trees read-only so
the dynamic linker can resolve. Self-binds the chroot first so the
findmnt-guarded cleanup can recurse safely.

AF_UNIX sun_path is 108 bytes; the chroot path easily blows past that.
Daemon-side launch pre-symlinks the short request socket path to the
long chroot socket before Machine.Start so the SDK's poll/connect
sees the short path while the kernel resolves to the chroot socket.
--new-pid-ns is intentionally disabled — jailer's PID-namespace fork
makes the SDK see the parent exit and tear the API socket down too
early.

CapabilityBoundingSet for the helper expands to add CAP_FOWNER,
CAP_KILL, CAP_MKNOD, CAP_SETGID, CAP_SETUID, CAP_SYS_CHROOT alongside
the existing CAP_CHOWN/CAP_DAC_OVERRIDE/CAP_NET_ADMIN/CAP_NET_RAW/
CAP_SYS_ADMIN.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 14:38:07 -03:00

527 lines
18 KiB
Go

package daemon
import (
"context"
"errors"
"fmt"
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
"syscall"
"banger/internal/daemon/dmsnap"
"banger/internal/daemon/fcproc"
"banger/internal/firecracker"
"banger/internal/hostnat"
"banger/internal/model"
"banger/internal/paths"
"banger/internal/roothelper"
"banger/internal/system"
)
// privilegedOps is the set of host operations the daemon needs elevated
// privileges for. This file provides two implementations: localPrivilegedOps,
// which runs the operations in-process through the command runner, and
// helperPrivilegedOps, which forwards them to the root helper client.
type privilegedOps interface {
	// Bridge, tap, resolver and NAT plumbing.
	EnsureBridge(ctx context.Context) error
	CreateTap(ctx context.Context, tapName string) error
	DeleteTap(ctx context.Context, tapName string) error
	SyncResolverRouting(ctx context.Context, serverAddr string) error
	ClearResolverRouting(ctx context.Context) error
	EnsureNAT(ctx context.Context, guestIP, tap string, enable bool) error

	// Device-mapper snapshots and ext4 image access.
	CreateDMSnapshot(ctx context.Context, rootfsPath, cowPath, dmName string) (dmSnapshotHandles, error)
	CleanupDMSnapshot(ctx context.Context, handles dmSnapshotHandles) error
	RemoveDMSnapshot(ctx context.Context, target string) error
	FsckSnapshot(ctx context.Context, dmDev string) error
	ReadExt4File(ctx context.Context, imagePath, guestPath string) ([]byte, error)
	WriteExt4Files(ctx context.Context, imagePath string, files []roothelper.Ext4Write) error

	// Firecracker process lifecycle.
	ResolveFirecrackerBinary(ctx context.Context, requested string) (string, error)
	LaunchFirecracker(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (int, error)
	EnsureSocketAccess(ctx context.Context, socketPath, label string) error
	FindFirecrackerPID(ctx context.Context, apiSock string) (int, error)
	KillProcess(ctx context.Context, pid int) error
	SignalProcess(ctx context.Context, pid int, signal string) error
	ProcessRunning(ctx context.Context, pid int, apiSock string) (bool, error)
	CleanupJailerChroot(ctx context.Context, chrootRoot string) error
}
// localPrivilegedOps implements privilegedOps in-process: it shells out
// through runner (using RunSudo where needed) and through fcproc managers
// built from config and layout.
type localPrivilegedOps struct {
	runner system.CommandRunner // executes host commands
	logger *slog.Logger         // handed to fcproc managers and machine configs
	config model.DaemonConfig   // bridge / firecracker settings
	layout paths.Layout         // supplies RuntimeDir for fcproc
	// clientUID and clientGID are the owner passed along when creating tap
	// devices and when granting access to firecracker sockets.
	clientUID, clientGID int
}
// privOps returns the network's privilegedOps implementation. When none has
// been set it builds a local one owned by the current process uid/gid and
// caches it on the receiver.
func (n *HostNetwork) privOps() privilegedOps {
	if n.priv != nil {
		return n.priv
	}
	n.priv = newLocalPrivilegedOps(n.runner, n.logger, n.config, n.layout, os.Getuid(), os.Getgid())
	return n.priv
}
// privOps returns the service's privilegedOps implementation. When none has
// been set it builds a local one owned by the current process uid/gid and
// caches it on the receiver.
func (s *VMService) privOps() privilegedOps {
	if s.priv != nil {
		return s.priv
	}
	s.priv = newLocalPrivilegedOps(s.runner, s.logger, s.config, s.layout, os.Getuid(), os.Getgid())
	return s.priv
}
// newLocalPrivilegedOps builds the in-process privilegedOps implementation.
// A negative clientUID or clientGID is replaced by the current process's
// uid or gid respectively.
func newLocalPrivilegedOps(runner system.CommandRunner, logger *slog.Logger, cfg model.DaemonConfig, layout paths.Layout, clientUID, clientGID int) privilegedOps {
	ops := &localPrivilegedOps{
		runner:    runner,
		logger:    logger,
		config:    cfg,
		layout:    layout,
		clientUID: clientUID,
		clientGID: clientGID,
	}
	if ops.clientUID < 0 {
		ops.clientUID = os.Getuid()
	}
	if ops.clientGID < 0 {
		ops.clientGID = os.Getgid()
	}
	return ops
}
// EnsureBridge delegates to the fcproc manager's EnsureBridge.
func (o *localPrivilegedOps) EnsureBridge(ctx context.Context) error {
	manager := o.fc()
	return manager.EnsureBridge(ctx)
}
// CreateTap creates tapName through the fcproc manager, passing the
// configured client uid/gid as the owner.
func (o *localPrivilegedOps) CreateTap(ctx context.Context, tapName string) error {
	manager := o.fc()
	return manager.CreateTapOwned(ctx, tapName, o.clientUID, o.clientGID)
}
// DeleteTap removes the tap device by running `ip link del <tapName>` via
// sudo. Command output is discarded; only the error is reported.
func (o *localPrivilegedOps) DeleteTap(ctx context.Context, tapName string) error {
	_, runErr := o.runner.RunSudo(ctx, "ip", "link", "del", tapName)
	return runErr
}
// SyncResolverRouting configures resolvectl on the bridge interface: it sets
// serverAddr as the link's DNS server, sets vmResolverRouteDomain as its
// domain, and turns default-route off. It is a no-op (nil) when the bridge
// name or serverAddr is blank, or when resolvectl is not found. The first
// failing resolvectl invocation aborts the sequence.
func (o *localPrivilegedOps) SyncResolverRouting(ctx context.Context, serverAddr string) error {
	if strings.TrimSpace(o.config.BridgeName) == "" || strings.TrimSpace(serverAddr) == "" {
		return nil
	}
	if _, lookupErr := system.LookupExecutable("resolvectl"); lookupErr != nil {
		return nil
	}
	// resolvectl <verb> <bridge> <value>
	resolvectl := func(verb, value string) error {
		_, err := o.runner.RunSudo(ctx, "resolvectl", verb, o.config.BridgeName, value)
		return err
	}
	if err := resolvectl("dns", serverAddr); err != nil {
		return err
	}
	if err := resolvectl("domain", vmResolverRouteDomain); err != nil {
		return err
	}
	return resolvectl("default-route", "no")
}
// ClearResolverRouting runs `resolvectl revert <bridge>` via sudo. It is a
// no-op (nil) when no bridge name is configured or resolvectl is not found.
func (o *localPrivilegedOps) ClearResolverRouting(ctx context.Context) error {
	bridge := o.config.BridgeName
	if strings.TrimSpace(bridge) == "" {
		return nil
	}
	if _, lookupErr := system.LookupExecutable("resolvectl"); lookupErr != nil {
		return nil
	}
	_, runErr := o.runner.RunSudo(ctx, "resolvectl", "revert", bridge)
	return runErr
}
// EnsureNAT delegates to hostnat.Ensure with this instance's command runner.
func (o *localPrivilegedOps) EnsureNAT(ctx context.Context, guestIP, tap string, enable bool) error {
	runner := o.runner
	return hostnat.Ensure(ctx, runner, guestIP, tap, enable)
}
// CreateDMSnapshot delegates to dmsnap.Create with this instance's command
// runner and returns the handles it produces.
func (o *localPrivilegedOps) CreateDMSnapshot(ctx context.Context, rootfsPath, cowPath, dmName string) (dmSnapshotHandles, error) {
	runner := o.runner
	return dmsnap.Create(ctx, runner, rootfsPath, cowPath, dmName)
}
// CleanupDMSnapshot delegates to dmsnap.Cleanup for the given handles.
func (o *localPrivilegedOps) CleanupDMSnapshot(ctx context.Context, handles dmSnapshotHandles) error {
	runner := o.runner
	return dmsnap.Cleanup(ctx, runner, handles)
}
// RemoveDMSnapshot delegates to dmsnap.Remove for the given target.
func (o *localPrivilegedOps) RemoveDMSnapshot(ctx context.Context, target string) error {
	runner := o.runner
	return dmsnap.Remove(ctx, runner, target)
}
// FsckSnapshot runs `e2fsck -fy` on dmDev via sudo. A failure whose exit
// code (per system.ExitCode) is 0 or 1 is treated as success; any other
// failure, including one with a negative code, is returned.
// NOTE(review): exit status 1 presumably means "errors corrected" per the
// e2fsck manual — confirm.
func (o *localPrivilegedOps) FsckSnapshot(ctx context.Context, dmDev string) error {
	_, runErr := o.runner.RunSudo(ctx, "e2fsck", "-fy", dmDev)
	if runErr == nil {
		return nil
	}
	if code := system.ExitCode(runErr); code == 0 || code == 1 {
		return nil
	}
	return runErr
}
// ReadExt4File delegates to system.ReadExt4File to read guestPath out of the
// image at imagePath.
func (o *localPrivilegedOps) ReadExt4File(ctx context.Context, imagePath, guestPath string) ([]byte, error) {
	runner := o.runner
	return system.ReadExt4File(ctx, runner, imagePath, guestPath)
}
// WriteExt4Files writes each entry of files into the image at imagePath, in
// order, via system.WriteExt4FileOwned with uid 0 and gid 0. An entry whose
// Mode is zero is written with mode 0o644. The first failure stops the loop
// and is returned.
func (o *localPrivilegedOps) WriteExt4Files(ctx context.Context, imagePath string, files []roothelper.Ext4Write) error {
	for i := range files {
		entry := files[i]
		perm := os.FileMode(entry.Mode)
		if perm == 0 {
			perm = 0o644
		}
		err := system.WriteExt4FileOwned(ctx, o.runner, imagePath, entry.GuestPath, perm, 0, 0, entry.Data)
		if err != nil {
			return err
		}
	}
	return nil
}
// ResolveFirecrackerBinary resolves the firecracker binary through a fresh
// fcproc manager. The requested path wins when non-blank; otherwise the
// daemon's configured FirecrackerBin is used. The context is unused.
func (o *localPrivilegedOps) ResolveFirecrackerBinary(_ context.Context, requested string) (string, error) {
	cfg := fcproc.Config{
		FirecrackerBin: normalizeFirecrackerBinary(requested, o.config.FirecrackerBin),
	}
	return fcproc.New(o.runner, cfg, o.logger).ResolveBinary()
}
// LaunchFirecracker builds the machine config for req (staging the jailer
// chroot when req.Jailer is set), starts the VM and returns the firecracker
// pid.
//
// On any failure after Start has been attempted the firecracker process is
// killed (best effort) before the error is returned, so a failed launch does
// not leave an orphaned VMM behind while the caller sees pid 0.
func (o *localPrivilegedOps) LaunchFirecracker(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (int, error) {
	mc, err := o.buildLaunchMachineConfig(ctx, req)
	if err != nil {
		return 0, err
	}
	// Symlink before Start: with jailer the actual API socket lives at
	// `<chroot>/firecracker.socket` (~120+ bytes — over the AF_UNIX
	// sun_path limit of 108). The SDK's waitForSocket and connect(2)
	// would EINVAL on the long path. Pre-creating the symlink at the
	// short req.SocketPath lets the SDK poll/connect via the short
	// path; the kernel only enforces sun_path on the path you pass,
	// not on the resolved target.
	if err := o.exposeJailerSockets(req); err != nil {
		return 0, fmt.Errorf("expose jailer sockets: %w", err)
	}
	machine, err := firecracker.NewMachine(ctx, mc)
	if err != nil {
		return 0, err
	}

	// reap kills whatever firecracker process can be resolved for this
	// machine. It uses context.Background() so cleanup still runs when ctx
	// has already been cancelled. Kill errors are ignored on purpose: the
	// launch error is the one the caller needs to see.
	reap := func() {
		if pid := o.fc().ResolvePID(context.Background(), machine, mc.SocketPath); pid > 0 {
			_ = o.KillProcess(context.Background(), pid)
		}
	}

	// The chown runs concurrently with Start; its result is always drained
	// before either error is inspected.
	chownDone := o.maybeChownSockets(ctx, req, mc)
	startErr := machine.Start(ctx)
	chownErr := <-chownDone
	if startErr != nil {
		reap()
		return 0, startErr
	}
	if chownErr != nil {
		reap()
		return 0, chownErr
	}
	if req.Jailer == nil {
		// Belt-and-suspenders for the legacy direct-firecracker path.
		// The jailer path doesn't need this — firecracker drops to the
		// configured uid before creating the socket.
		if err := o.EnsureSocketAccess(ctx, mc.SocketPath, "firecracker api socket"); err != nil {
			reap()
			return 0, err
		}
		if strings.TrimSpace(mc.VSockPath) != "" {
			if err := o.EnsureSocketAccess(ctx, mc.VSockPath, "firecracker vsock socket"); err != nil {
				reap()
				return 0, err
			}
		}
	}
	pid := o.fc().ResolvePID(context.Background(), machine, mc.SocketPath)
	if pid <= 0 {
		return 0, errors.New("firecracker started but pid could not be resolved")
	}
	return pid, nil
}
// maybeChownSockets starts the asynchronous socket chown for the legacy
// direct-firecracker path and returns the channel its result arrives on.
// In jailer mode no chown is performed: firecracker already runs as the
// configured uid when it creates the socket, and chown on the symlink would
// only touch the symlink's metadata, not the target's. In that case the
// returned channel is already closed and yields a single nil.
func (o *localPrivilegedOps) maybeChownSockets(ctx context.Context, req roothelper.FirecrackerLaunchRequest, mc firecracker.MachineConfig) <-chan error {
	if req.Jailer == nil {
		sockets := []string{mc.SocketPath, mc.VSockPath}
		return o.fc().EnsureSocketAccessForAsync(ctx, sockets, o.clientUID, o.clientGID)
	}
	done := make(chan error, 1)
	done <- nil
	close(done)
	return done
}
// buildLaunchMachineConfig turns a launch request into a
// firecracker.MachineConfig, mirroring the helper-side equivalent. Without a
// jailer the request fields are copied through unchanged. With a jailer the
// chroot tree is staged first and the path fields are rewritten to their
// chroot-translated form (host-visible for sockets, chroot-internal for
// kernel/drives — see firecracker.MachineConfig.Jailer doc).
func (o *localPrivilegedOps) buildLaunchMachineConfig(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (firecracker.MachineConfig, error) {
	mc := firecracker.MachineConfig{
		BinaryPath:      req.BinaryPath,
		VMID:            req.VMID,
		SocketPath:      req.SocketPath,
		LogPath:         req.LogPath,
		MetricsPath:     req.MetricsPath,
		KernelImagePath: req.KernelImagePath,
		InitrdPath:      req.InitrdPath,
		KernelArgs:      req.KernelArgs,
		Drives:          req.Drives,
		TapDevice:       req.TapDevice,
		VSockPath:       req.VSockPath,
		VSockCID:        req.VSockCID,
		VCPUCount:       req.VCPUCount,
		MemoryMiB:       req.MemoryMiB,
		Logger:          o.logger,
	}
	jail := req.Jailer
	if jail == nil {
		return mc, nil
	}

	// Each drive is staged in the chroot under its chroot name and is then
	// addressed by firecracker as "/<name>".
	specs := make([]fcproc.ChrootDriveSpec, len(req.Drives))
	jailedDrives := make([]firecracker.DriveConfig, len(req.Drives))
	for i := range req.Drives {
		drive := req.Drives[i]
		name := chrootDriveName(drive)
		specs[i] = fcproc.ChrootDriveSpec{ChrootName: name, HostPath: drive.Path}
		jailedDrives[i] = firecracker.DriveConfig{
			ID:       drive.ID,
			Path:     "/" + name,
			ReadOnly: drive.ReadOnly,
			IsRoot:   drive.IsRoot,
		}
	}

	hasVSock := strings.TrimSpace(req.VSockPath) != ""
	chrootRoot := firecracker.JailerChrootRoot(jail.ChrootBaseDir, req.VMID)
	err := o.fc().PrepareJailerChroot(ctx, chrootRoot,
		jail.UID, jail.GID,
		req.BinaryPath,
		req.KernelImagePath, "vmlinux",
		req.InitrdPath, "initrd",
		specs, hasVSock,
	)
	if err != nil {
		return firecracker.MachineConfig{}, fmt.Errorf("prepare jailer chroot: %w", err)
	}

	// SocketPath deliberately keeps the short request path: the SDK
	// polls/connects to it via os.Stat / net.Dial("unix", ...), and AF_UNIX
	// sun_path is hard-capped at 108 bytes, which the real chroot path
	// exceeds. exposeJailerSockets pre-creates req.SocketPath as a symlink
	// to the long chroot socket; the kernel only enforces sun_path on the
	// path handed to connect, not on the resolved target.
	//
	// VSockPath is different: it is sent to firecracker via the API and
	// resolved from inside the chroot, so it has to be the chroot-internal
	// path. The host-visible vsock socket is reachable through a symlink at
	// req.VSockPath, also installed by exposeJailerSockets.
	if hasVSock {
		mc.VSockPath = firecracker.JailerVSockName
	}
	mc.KernelImagePath = "/vmlinux"
	mc.InitrdPath = ""
	if strings.TrimSpace(req.InitrdPath) != "" {
		mc.InitrdPath = "/initrd"
	}
	mc.Drives = jailedDrives
	// LogPath is left set so buildProcessRunner's openLogFile captures
	// firecracker stderr via cmd.Stderr. buildConfig clears
	// sdk.Config.LogPath for jailer mode to avoid PUT /logger with a host
	// path firecracker can't open.
	mc.MetricsPath = ""
	mc.Jailer = &firecracker.JailerOpts{
		Binary:        jail.Binary,
		ChrootBaseDir: jail.ChrootBaseDir,
		UID:           jail.UID,
		GID:           jail.GID,
	}
	return mc, nil
}
// exposeJailerSockets installs symlinks at the request's short socket paths
// that point at the sockets inside the VM's jailer chroot: always for the
// API socket, and for the vsock socket only when req.VSockPath is non-blank.
// It does nothing when the request has no jailer.
func (o *localPrivilegedOps) exposeJailerSockets(req roothelper.FirecrackerLaunchRequest) error {
	if req.Jailer == nil {
		return nil
	}
	root := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
	// inChroot maps a chroot-internal name to its host-visible path.
	inChroot := func(name string) string {
		return filepath.Join(root, strings.TrimPrefix(name, "/"))
	}
	if err := atomicSymlink(inChroot(firecracker.JailerSocketName), req.SocketPath); err != nil {
		return err
	}
	if strings.TrimSpace(req.VSockPath) == "" {
		return nil
	}
	return atomicSymlink(inChroot(firecracker.JailerVSockName), req.VSockPath)
}
// chrootDriveName picks the file name a drive gets inside the jailer chroot:
// the drive's whitespace-trimmed ID when that is non-empty, otherwise the
// base name of its host path. It mirrors the helper-side function of the
// same name and stays a free function so both paths produce identical
// chroot layouts.
func chrootDriveName(d firecracker.DriveConfig) string {
	trimmedID := strings.TrimSpace(d.ID)
	if trimmedID == "" {
		return filepath.Base(d.Path)
	}
	return trimmedID
}
// atomicSymlink makes link a symlink to target, replacing whatever is
// currently at link.
//
// The symlink is first created inside a private temporary directory next to
// link and then moved into place with rename(2). Readers therefore never
// observe link missing, and concurrent callers cannot collide on EEXIST the
// way a remove-then-symlink sequence can.
//
// rename(2) will not replace a directory with a symlink, so if the rename
// fails the existing entry is removed with os.Remove and the rename is
// retried once; this keeps the previous behaviour for an empty directory
// sitting at link. If that removal fails, the original rename error is
// returned.
func atomicSymlink(target, link string) error {
	// Stage in link's own directory so the rename stays on one filesystem.
	stage, err := os.MkdirTemp(filepath.Dir(link), ".symlink-*")
	if err != nil {
		return err
	}
	defer os.RemoveAll(stage)

	staged := filepath.Join(stage, filepath.Base(link))
	if err := os.Symlink(target, staged); err != nil {
		return err
	}
	renameErr := os.Rename(staged, link)
	if renameErr == nil {
		return nil
	}
	if rmErr := os.Remove(link); rmErr != nil {
		return renameErr
	}
	return os.Rename(staged, link)
}
// EnsureSocketAccess delegates to the fcproc manager's EnsureSocketAccessFor,
// passing the configured client uid/gid for socketPath.
func (o *localPrivilegedOps) EnsureSocketAccess(ctx context.Context, socketPath, label string) error {
	manager := o.fc()
	return manager.EnsureSocketAccessFor(ctx, socketPath, label, o.clientUID, o.clientGID)
}
// FindFirecrackerPID delegates to the fcproc manager's FindPID for apiSock.
func (o *localPrivilegedOps) FindFirecrackerPID(ctx context.Context, apiSock string) (int, error) {
	manager := o.fc()
	return manager.FindPID(ctx, apiSock)
}
// KillProcess delegates to the fcproc manager's Kill for pid.
func (o *localPrivilegedOps) KillProcess(ctx context.Context, pid int) error {
	manager := o.fc()
	return manager.Kill(ctx, pid)
}
// SignalProcess sends signal to pid by running `kill -<signal> <pid>` via
// sudo. Surrounding whitespace in signal is stripped, and a blank signal
// defaults to "TERM".
//
// Non-positive pids are rejected with an error instead of being handed to
// kill: with kill, pid 0 addresses the caller's whole process group and
// negative values address process groups (or, for -1, every process the
// caller may signal), which must never happen under sudo by accident.
func (o *localPrivilegedOps) SignalProcess(ctx context.Context, pid int, signal string) error {
	if pid <= 0 {
		return fmt.Errorf("signal process: refusing non-positive pid %d", pid)
	}
	// Use the trimmed name so " KILL" does not become the invalid "- KILL".
	signal = strings.TrimSpace(signal)
	if signal == "" {
		signal = "TERM"
	}
	_, err := o.runner.RunSudo(ctx, "kill", "-"+signal, strconv.Itoa(pid))
	return err
}
// ProcessRunning reports the result of system.ProcessRunning for pid and
// apiSock. The context is unused and the error is always nil.
func (o *localPrivilegedOps) ProcessRunning(_ context.Context, pid int, apiSock string) (bool, error) {
	alive := system.ProcessRunning(pid, apiSock)
	return alive, nil
}
// CleanupJailerChroot delegates to the fcproc manager's CleanupJailerChroot
// for chrootRoot.
func (o *localPrivilegedOps) CleanupJailerChroot(ctx context.Context, chrootRoot string) error {
	manager := o.fc()
	return manager.CleanupJailerChroot(ctx, chrootRoot)
}
// fc builds a new fcproc.Manager on every call from the daemon's firecracker
// binary, bridge settings and runtime directory, using this instance's
// runner and logger.
func (o *localPrivilegedOps) fc() *fcproc.Manager {
	cfg := fcproc.Config{
		FirecrackerBin: normalizeFirecrackerBinary("", o.config.FirecrackerBin),
		BridgeName:     o.config.BridgeName,
		BridgeIP:       o.config.BridgeIP,
		CIDR:           o.config.CIDR,
		RuntimeDir:     o.layout.RuntimeDir,
	}
	return fcproc.New(o.runner, cfg, o.logger)
}
// helperPrivilegedOps implements privilegedOps by forwarding each operation
// to the root helper through client.
type helperPrivilegedOps struct {
	client *roothelper.Client // connection to the root helper
	config model.DaemonConfig // source of bridge and firecracker settings
	layout paths.Layout       // stored at construction
}
// newHelperPrivilegedOps builds the privilegedOps implementation that talks
// to the root helper via client.
func newHelperPrivilegedOps(client *roothelper.Client, cfg model.DaemonConfig, layout paths.Layout) privilegedOps {
	ops := &helperPrivilegedOps{
		client: client,
		config: cfg,
		layout: layout,
	}
	return ops
}
// EnsureBridge forwards to the root helper client together with the daemon's
// network configuration.
func (o *helperPrivilegedOps) EnsureBridge(ctx context.Context) error {
	network := o.networkConfig()
	return o.client.EnsureBridge(ctx, network)
}
// CreateTap forwards tapName to the root helper client together with the
// daemon's network configuration.
func (o *helperPrivilegedOps) CreateTap(ctx context.Context, tapName string) error {
	network := o.networkConfig()
	return o.client.CreateTap(ctx, network, tapName)
}
// DeleteTap forwards the request for tapName to the root helper client.
func (o *helperPrivilegedOps) DeleteTap(ctx context.Context, tapName string) error {
	helper := o.client
	return helper.DeleteTap(ctx, tapName)
}
// SyncResolverRouting forwards the configured bridge name and serverAddr to
// the root helper client.
func (o *helperPrivilegedOps) SyncResolverRouting(ctx context.Context, serverAddr string) error {
	bridge := o.config.BridgeName
	return o.client.SyncResolverRouting(ctx, bridge, serverAddr)
}
// ClearResolverRouting forwards the configured bridge name to the root
// helper client.
func (o *helperPrivilegedOps) ClearResolverRouting(ctx context.Context) error {
	bridge := o.config.BridgeName
	return o.client.ClearResolverRouting(ctx, bridge)
}
// EnsureNAT forwards the request unchanged to the root helper client.
func (o *helperPrivilegedOps) EnsureNAT(ctx context.Context, guestIP, tap string, enable bool) error {
	helper := o.client
	return helper.EnsureNAT(ctx, guestIP, tap, enable)
}
// CreateDMSnapshot forwards the request unchanged to the root helper client
// and returns the handles it reports.
func (o *helperPrivilegedOps) CreateDMSnapshot(ctx context.Context, rootfsPath, cowPath, dmName string) (dmSnapshotHandles, error) {
	helper := o.client
	return helper.CreateDMSnapshot(ctx, rootfsPath, cowPath, dmName)
}
// CleanupDMSnapshot forwards handles to the root helper client.
func (o *helperPrivilegedOps) CleanupDMSnapshot(ctx context.Context, handles dmSnapshotHandles) error {
	helper := o.client
	return helper.CleanupDMSnapshot(ctx, handles)
}
// RemoveDMSnapshot forwards target to the root helper client.
func (o *helperPrivilegedOps) RemoveDMSnapshot(ctx context.Context, target string) error {
	helper := o.client
	return helper.RemoveDMSnapshot(ctx, target)
}
// FsckSnapshot forwards dmDev to the root helper client.
func (o *helperPrivilegedOps) FsckSnapshot(ctx context.Context, dmDev string) error {
	helper := o.client
	return helper.FsckSnapshot(ctx, dmDev)
}
// ReadExt4File forwards the request unchanged to the root helper client and
// returns the bytes it reports.
func (o *helperPrivilegedOps) ReadExt4File(ctx context.Context, imagePath, guestPath string) ([]byte, error) {
	helper := o.client
	return helper.ReadExt4File(ctx, imagePath, guestPath)
}
// WriteExt4Files forwards the whole batch of files for imagePath to the root
// helper client.
func (o *helperPrivilegedOps) WriteExt4Files(ctx context.Context, imagePath string, files []roothelper.Ext4Write) error {
	helper := o.client
	return helper.WriteExt4Files(ctx, imagePath, files)
}
// ResolveFirecrackerBinary asks the root helper client to resolve the
// firecracker binary. The requested path wins when non-blank; otherwise the
// daemon's configured FirecrackerBin is sent.
func (o *helperPrivilegedOps) ResolveFirecrackerBinary(ctx context.Context, requested string) (string, error) {
	binary := normalizeFirecrackerBinary(requested, o.config.FirecrackerBin)
	return o.client.ResolveFirecrackerBinary(ctx, binary)
}
// LaunchFirecracker fills in the request's network configuration, asks the
// root helper to launch firecracker and returns the pid it reports.
//
// In jailer mode the socket symlinks are then re-created on the daemon side.
// If that fails, the freshly launched process is killed (best effort) through
// the helper before the error is returned; otherwise the caller would get
// pid 0 while the VMM kept running.
func (o *helperPrivilegedOps) LaunchFirecracker(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (int, error) {
	req.Network = o.networkConfig()
	pid, err := o.client.LaunchFirecracker(ctx, req)
	if err != nil {
		return 0, err
	}
	if req.Jailer == nil {
		return pid, nil
	}

	// abort tears the launched VMM down and reports err. It uses
	// context.Background() so the kill is still attempted when ctx has been
	// cancelled; the kill error is ignored because err is the one the
	// caller needs to see.
	abort := func(err error) (int, error) {
		_ = o.client.KillProcess(context.Background(), pid)
		return 0, err
	}

	// The root helper runs with PrivateMounts=yes, so symlinks it creates
	// (exposeJailerSockets) are invisible to the daemon's namespace. Re-create
	// them here so the daemon can reach the API and vsock sockets.
	chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
	hostAPI := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerSocketName, "/"))
	if err := atomicSymlink(hostAPI, req.SocketPath); err != nil {
		return abort(fmt.Errorf("api socket symlink: %w", err))
	}
	if strings.TrimSpace(req.VSockPath) != "" {
		hostVSock := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerVSockName, "/"))
		if err := atomicSymlink(hostVSock, req.VSockPath); err != nil {
			return abort(fmt.Errorf("vsock symlink: %w", err))
		}
	}
	return pid, nil
}
// EnsureSocketAccess makes socketPath usable by the daemon. When the path
// can be stat'ed and is owned by the current process uid, it is chmod'ed to
// 0o600 directly. In every other case (stat failure, no syscall.Stat_t
// available, different owner) the request is forwarded to the root helper
// client.
func (o *helperPrivilegedOps) EnsureSocketAccess(ctx context.Context, socketPath, label string) error {
	info, statErr := os.Stat(socketPath)
	if statErr != nil {
		return o.client.EnsureSocketAccess(ctx, socketPath, label)
	}
	st, ok := info.Sys().(*syscall.Stat_t)
	if !ok || int(st.Uid) != os.Getuid() {
		return o.client.EnsureSocketAccess(ctx, socketPath, label)
	}
	return os.Chmod(socketPath, 0o600)
}
// FindFirecrackerPID forwards apiSock to the root helper client and returns
// the pid it reports.
func (o *helperPrivilegedOps) FindFirecrackerPID(ctx context.Context, apiSock string) (int, error) {
	helper := o.client
	return helper.FindFirecrackerPID(ctx, apiSock)
}
// KillProcess forwards pid to the root helper client.
func (o *helperPrivilegedOps) KillProcess(ctx context.Context, pid int) error {
	helper := o.client
	return helper.KillProcess(ctx, pid)
}
// SignalProcess forwards pid and signal unchanged to the root helper client.
func (o *helperPrivilegedOps) SignalProcess(ctx context.Context, pid int, signal string) error {
	helper := o.client
	return helper.SignalProcess(ctx, pid, signal)
}
// ProcessRunning forwards pid and apiSock to the root helper client and
// returns its answer.
func (o *helperPrivilegedOps) ProcessRunning(ctx context.Context, pid int, apiSock string) (bool, error) {
	helper := o.client
	return helper.ProcessRunning(ctx, pid, apiSock)
}
// CleanupJailerChroot forwards chrootRoot to the root helper client.
func (o *helperPrivilegedOps) CleanupJailerChroot(ctx context.Context, chrootRoot string) error {
	helper := o.client
	return helper.CleanupJailerChroot(ctx, chrootRoot)
}
// networkConfig copies the bridge name, bridge IP and CIDR from the daemon
// configuration into the form the root helper requests carry.
func (o *helperPrivilegedOps) networkConfig() roothelper.NetworkConfig {
	var network roothelper.NetworkConfig
	network.BridgeName = o.config.BridgeName
	network.BridgeIP = o.config.BridgeIP
	network.CIDR = o.config.CIDR
	return network
}
// normalizeFirecrackerBinary picks the firecracker binary path to use: the
// whitespace-trimmed requested value when it is non-empty, otherwise the
// whitespace-trimmed configured value (which may itself be empty).
func normalizeFirecrackerBinary(requested, configured string) string {
	for _, candidate := range [...]string{requested, configured} {
		if trimmed := strings.TrimSpace(candidate); trimmed != "" {
			return trimmed
		}
	}
	return ""
}