firecracker: adopt firecracker-jailer for VM launch (Phase B)

Each VM's firecracker now runs inside a per-VM chroot dropped to the
registered owner UID via firecracker-jailer. Closes the broad ambient-
sudo escalation surface that survived Phase A: the helper still needs
caps for tap/bridge/dm/loop/iptables, but the VMM itself no longer
runs as root in the host root filesystem.

The host helper stages each chroot up front: hard-links the kernel
and (optional) initrd, mknods block-device drives + /dev/vhost-vsock,
copies in the firecracker binary (jailer opens it O_RDWR so a ro bind
fails with EROFS), and bind-mounts the /usr/lib and /lib trees read-only so
the dynamic linker can resolve firecracker's shared libraries. It self-binds
the chroot first so the findmnt-guarded cleanup can recurse safely.

AF_UNIX sun_path is 108 bytes; the chroot path easily blows past that.
Daemon-side launch pre-symlinks the short request socket path to the
long chroot socket before Machine.Start so the SDK's poll/connect
sees the short path while the kernel resolves to the chroot socket.
--new-pid-ns is intentionally disabled — jailer's PID-namespace fork
makes the SDK see the parent exit and tear the API socket down too
early.

CapabilityBoundingSet for the helper expands to add CAP_FOWNER,
CAP_KILL, CAP_MKNOD, CAP_SETGID, CAP_SETUID, CAP_SYS_CHROOT alongside
the existing CAP_CHOWN/CAP_DAC_OVERRIDE/CAP_NET_ADMIN/CAP_NET_RAW/
CAP_SYS_ADMIN.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Thales Maciel 2026-04-28 14:38:07 -03:00
parent d73efe6fbc
commit 6b543cb17f
No known key found for this signature in database
GPG key ID: 33112E6833C34679
12 changed files with 864 additions and 56 deletions

View file

@ -3,8 +3,10 @@ package daemon
import (
"context"
"errors"
"fmt"
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
"syscall"
@ -39,6 +41,7 @@ type privilegedOps interface {
KillProcess(context.Context, int) error
SignalProcess(context.Context, int, string) error
ProcessRunning(context.Context, int, string) (bool, error)
CleanupJailerChroot(context.Context, string) error
}
type localPrivilegedOps struct {
@ -170,7 +173,77 @@ func (o *localPrivilegedOps) ResolveFirecrackerBinary(_ context.Context, request
}
// LaunchFirecracker starts a firecracker VM for req and returns the pid of
// the resulting process.
//
// It builds the machine config with buildLaunchMachineConfig, installs the
// short-path socket symlinks with exposeJailerSockets (which does nothing
// when req.Jailer is nil), creates the machine, and calls Start. Every
// failure path returns pid 0 together with the error.
func (o *localPrivilegedOps) LaunchFirecracker(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (int, error) {
machine, err := firecracker.NewMachine(ctx, firecracker.MachineConfig{
mc, err := o.buildLaunchMachineConfig(ctx, req)
if err != nil {
return 0, err
}
// Symlink before Start: with jailer the actual API socket lives at
// `<chroot>/firecracker.socket` (~120+ bytes — over the AF_UNIX
// sun_path limit of 108). The SDK's waitForSocket and connect(2)
// would EINVAL on the long path. Pre-creating the symlink at the
// short req.SocketPath lets the SDK poll/connect via the short
// path; the kernel only enforces sun_path on the path you pass,
// not on the resolved target.
if err := o.exposeJailerSockets(req); err != nil {
return 0, fmt.Errorf("expose jailer sockets: %w", err)
}
machine, err := firecracker.NewMachine(ctx, mc)
if err != nil {
return 0, err
}
// The chown result channel is obtained before Start and drained right
// after it returns; in jailer mode the channel already holds a nil result.
chownDone := o.maybeChownSockets(ctx, req, mc)
startErr := machine.Start(ctx)
chownErr := <-chownDone
if startErr != nil {
// Start failed: look up any process that was spawned and kill it on a
// best-effort basis (the KillProcess error is deliberately discarded).
// context.Background() is used instead of ctx — presumably so cleanup
// still runs when ctx is already cancelled; confirm.
if pid := o.fc().ResolvePID(context.Background(), machine, mc.SocketPath); pid > 0 {
_ = o.KillProcess(context.Background(), pid)
}
return 0, startErr
}
// NOTE(review): this return and the error returns below happen after a
// successful Start and do not stop the process here — confirm the caller
// tears the VM down when LaunchFirecracker reports an error.
if chownErr != nil {
return 0, chownErr
}
if req.Jailer == nil {
// Belt-and-suspenders for the legacy direct-firecracker path.
// The jailer path doesn't need this — firecracker drops to the
// configured uid before creating the socket.
if err := o.EnsureSocketAccess(ctx, mc.SocketPath, "firecracker api socket"); err != nil {
return 0, err
}
// The vsock socket is only checked when a non-blank path was configured.
if strings.TrimSpace(mc.VSockPath) != "" {
if err := o.EnsureSocketAccess(ctx, mc.VSockPath, "firecracker vsock socket"); err != nil {
return 0, err
}
}
}
// A non-positive pid is treated as "could not be resolved" and reported as
// an error rather than returned to the caller.
pid := o.fc().ResolvePID(context.Background(), machine, mc.SocketPath)
if pid <= 0 {
return 0, errors.New("firecracker started but pid could not be resolved")
}
return pid, nil
}
// maybeChownSockets returns a channel carrying the outcome of the socket
// ownership fix-up that accompanies Machine.Start.
//
// On the legacy direct-firecracker path (req.Jailer == nil) the API and
// vsock sockets are handed to EnsureSocketAccessForAsync together with the
// client uid/gid, and its channel is returned as-is. Under the jailer,
// firecracker already runs as the configured uid when it creates the
// sockets, so there is nothing to chown (a chown on the symlink would only
// touch the symlink's own metadata, not the target's); the caller gets a
// closed channel pre-loaded with a single nil result instead.
func (o *localPrivilegedOps) maybeChownSockets(ctx context.Context, req roothelper.FirecrackerLaunchRequest, mc firecracker.MachineConfig) <-chan error {
	if req.Jailer == nil {
		sockets := []string{mc.SocketPath, mc.VSockPath}
		return o.fc().EnsureSocketAccessForAsync(ctx, sockets, o.clientUID, o.clientGID)
	}
	done := make(chan error, 1)
	done <- nil
	close(done)
	return done
}
// buildLaunchMachineConfig mirrors the helper-side equivalent: when jailer
// is enabled, stage the chroot tree and rewrite the path fields to their
// chroot-translated form (host-visible for sockets, chroot-internal for
// kernel/drives — see firecracker.MachineConfig.Jailer doc).
//
// When req.Jailer is nil the config built from req is returned unchanged.
// A PrepareJailerChroot failure yields a zero MachineConfig and an error
// wrapped with "prepare jailer chroot".
func (o *localPrivilegedOps) buildLaunchMachineConfig(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (firecracker.MachineConfig, error) {
mc := firecracker.MachineConfig{
BinaryPath: req.BinaryPath,
VMID: req.VMID,
SocketPath: req.SocketPath,
@ -186,40 +259,101 @@ func (o *localPrivilegedOps) LaunchFirecracker(ctx context.Context, req roothelp
VCPUCount: req.VCPUCount,
MemoryMiB: req.MemoryMiB,
Logger: o.logger,
})
if err != nil {
return 0, err
}
// Race the chown against the SDK's HTTP probe inside Start: when the
// daemon is non-root, firecracker is launched under sudo and the API
// socket appears root-owned. Without a concurrent chown the SDK's
// connect(2) gets EACCES and Start times out before our post-Start
// EnsureSocketAccess can ever run.
chownDone := o.fc().EnsureSocketAccessForAsync(ctx, []string{req.SocketPath, req.VSockPath}, o.clientUID, o.clientGID)
startErr := machine.Start(ctx)
chownErr := <-chownDone
if startErr != nil {
if pid := o.fc().ResolvePID(context.Background(), machine, req.SocketPath); pid > 0 {
_ = o.KillProcess(context.Background(), pid)
}
return 0, startErr
if req.Jailer == nil {
return mc, nil
}
if chownErr != nil {
return 0, chownErr
chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
// Every requested drive gets a chroot-local name: driveSpecs maps that name
// to the host path for staging, while chrootDrives carries the same drive
// with its path rewritten to "/<name>" for the machine config.
driveSpecs := make([]fcproc.ChrootDriveSpec, 0, len(req.Drives))
chrootDrives := make([]firecracker.DriveConfig, 0, len(req.Drives))
for _, d := range req.Drives {
name := chrootDriveName(d)
driveSpecs = append(driveSpecs, fcproc.ChrootDriveSpec{ChrootName: name, HostPath: d.Path})
chrootDrives = append(chrootDrives, firecracker.DriveConfig{
ID: d.ID,
Path: "/" + name,
ReadOnly: d.ReadOnly,
IsRoot: d.IsRoot,
})
}
if err := o.EnsureSocketAccess(ctx, req.SocketPath, "firecracker api socket"); err != nil {
return 0, err
// A blank (whitespace-only) VSockPath means no vsock was requested.
wantVSock := strings.TrimSpace(req.VSockPath) != ""
// The kernel and initrd are passed with the chroot names "vmlinux" and
// "initrd", matching the "/vmlinux" and "/initrd" paths set on mc below.
if err := o.fc().PrepareJailerChroot(ctx, chrootRoot,
req.Jailer.UID, req.Jailer.GID,
req.BinaryPath,
req.KernelImagePath, "vmlinux",
req.InitrdPath, "initrd",
driveSpecs, wantVSock,
); err != nil {
return firecracker.MachineConfig{}, fmt.Errorf("prepare jailer chroot: %w", err)
}
// SocketPath stays the short request path: the SDK polls/connects
// to it via os.Stat / net.Dial("unix", ...), and AF_UNIX sun_path
// is hard-capped at 108 bytes — the actual chroot path is well over
// that. exposeJailerSockets pre-creates the req.SocketPath as a
// symlink whose target is the long chroot socket; the kernel only
// enforces sun_path on the path you hand to connect, not on the
// resolved target.
//
// VSockPath, by contrast, is sent to firecracker via the API and
// resolved from inside the chroot, so it must be the chroot-internal
// path. The host-visible vsock socket is reachable via a symlink
// at req.VSockPath, also installed by exposeJailerSockets.
// NOTE(review): chrootRoot is already consumed by the PrepareJailerChroot
// call above, so this blank assignment appears to be redundant.
_ = chrootRoot
if wantVSock {
mc.VSockPath = firecracker.JailerVSockName
}
mc.KernelImagePath = "/vmlinux"
// The initrd is optional: a blank request path clears the field instead of
// pointing it at a file inside the chroot.
if strings.TrimSpace(req.InitrdPath) != "" {
mc.InitrdPath = "/initrd"
} else {
mc.InitrdPath = ""
}
mc.Drives = chrootDrives
// LogPath stays set so buildProcessRunner's openLogFile captures firecracker
// stderr via cmd.Stderr. buildConfig clears sdk.Config.LogPath for jailer
// mode to avoid PUT /logger with a host path firecracker can't open.
// NOTE(review): MetricsPath is cleared here, presumably for the same
// host-path reason as the logger — confirm.
mc.MetricsPath = ""
mc.Jailer = &firecracker.JailerOpts{
Binary: req.Jailer.Binary,
ChrootBaseDir: req.Jailer.ChrootBaseDir,
UID: req.Jailer.UID,
GID: req.Jailer.GID,
}
return mc, nil
}
// exposeJailerSockets points the request's socket paths at the sockets that
// live inside the VM's jailer chroot.
//
// It does nothing and returns nil when req.Jailer is nil. Otherwise it makes
// req.SocketPath a symlink to the API socket under the chroot root and, when
// req.VSockPath is non-blank, makes req.VSockPath a symlink to the vsock
// socket under the same root. The first atomicSymlink error is returned
// unwrapped.
func (o *localPrivilegedOps) exposeJailerSockets(req roothelper.FirecrackerLaunchRequest) error {
if req.Jailer == nil {
return nil
}
chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
// The Jailer*Name constants may carry a leading "/"; it is trimmed before
// joining so the name is appended beneath chrootRoot.
hostAPI := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerSocketName, "/"))
if err := atomicSymlink(hostAPI, req.SocketPath); err != nil {
return err
}
if strings.TrimSpace(req.VSockPath) != "" {
if err := o.EnsureSocketAccess(ctx, req.VSockPath, "firecracker vsock socket"); err != nil {
return 0, err
hostVSock := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerVSockName, "/"))
if err := atomicSymlink(hostVSock, req.VSockPath); err != nil {
return err
}
}
pid := o.fc().ResolvePID(context.Background(), machine, req.SocketPath)
if pid <= 0 {
return 0, errors.New("firecracker started but pid could not be resolved")
return nil
}
// chrootDriveName mirrors the helper-side helper of the same name; kept as
// a free function so both paths produce identical chroot layouts.
//
// It returns the drive's ID with surrounding whitespace trimmed when that is
// non-empty, and otherwise the base name of the drive's host path.
//
// NOTE(review): the result is used verbatim as a file name inside the
// chroot; an ID containing a path separator, or one equal to another staged
// name such as "vmlinux" or "initrd", would presumably clash — confirm IDs
// are validated upstream.
func chrootDriveName(d firecracker.DriveConfig) string {
if id := strings.TrimSpace(d.ID); id != "" {
return id
}
return pid, nil
return filepath.Base(d.Path)
}
// atomicSymlink makes link a symbolic link pointing at target, replacing
// whatever entry currently occupies link.
//
// The symlink is first created under a temporary sibling name and then moved
// into place with os.Rename, so anyone looking up link sees either the
// previous entry or the new symlink, never a missing path. A plain
// remove-then-symlink sequence leaves a window in which link does not exist
// and can fail with EEXIST when two callers race on the same link.
//
// target is stored as given and does not need to exist. On failure the
// temporary name is removed on a best-effort basis and the underlying error
// is returned.
func atomicSymlink(target, link string) error {
	// Bounded so leftover temporaries from a crashed run cannot spin forever.
	const maxAttempts = 64
	for attempt := 0; attempt < maxAttempts; attempt++ {
		tmp := fmt.Sprintf("%s.tmp-%d-%d", link, os.Getpid(), attempt)
		err := os.Symlink(target, tmp)
		if errors.Is(err, os.ErrExist) {
			// Name is taken (concurrent caller or stale leftover); try the next.
			continue
		}
		if err != nil {
			return err
		}
		err = os.Rename(tmp, link)
		if err != nil {
			// rename cannot put a symlink over a directory. Removing link and
			// retrying keeps the long-standing behaviour of replacing an empty
			// directory; the removal fails for anything rename could not
			// legitimately replace, in which case the rename error stands.
			if rmErr := os.Remove(link); rmErr == nil {
				err = os.Rename(tmp, link)
			}
		}
		if err != nil {
			_ = os.Remove(tmp) // best-effort: do not leave the temporary behind
			return err
		}
		return nil
	}
	return fmt.Errorf("create temporary symlink for %q: %w", link, os.ErrExist)
}
func (o *localPrivilegedOps) EnsureSocketAccess(ctx context.Context, socketPath, label string) error {
@ -246,6 +380,10 @@ func (o *localPrivilegedOps) ProcessRunning(_ context.Context, pid int, apiSock
return system.ProcessRunning(pid, apiSock), nil
}
// CleanupJailerChroot removes the jailer chroot rooted at chrootRoot by
// delegating to the fcproc manager built from this ops' runner and config.
// The manager's error, if any, is returned unchanged.
func (o *localPrivilegedOps) CleanupJailerChroot(ctx context.Context, chrootRoot string) error {
	manager := o.fc()
	return manager.CleanupJailerChroot(ctx, chrootRoot)
}
func (o *localPrivilegedOps) fc() *fcproc.Manager {
return fcproc.New(o.runner, fcproc.Config{
FirecrackerBin: normalizeFirecrackerBinary("", o.config.FirecrackerBin),
@ -320,7 +458,27 @@ func (o *helperPrivilegedOps) ResolveFirecrackerBinary(ctx context.Context, requ
// LaunchFirecracker fills req.Network from this ops' network config,
// forwards the launch to the root-helper client and, for jailer launches,
// recreates the short-path socket symlinks on the daemon side before
// returning the pid reported by the helper.
//
// A client error is returned as-is with pid 0; a symlink failure is wrapped
// with "api socket symlink" or "vsock symlink".
func (o *helperPrivilegedOps) LaunchFirecracker(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (int, error) {
req.Network = o.networkConfig()
return o.client.LaunchFirecracker(ctx, req)
pid, err := o.client.LaunchFirecracker(ctx, req)
if err != nil {
return 0, err
}
// The root helper runs with PrivateMounts=yes, so symlinks it creates
// (exposeJailerSockets) are invisible to the daemon's namespace. Re-create
// them here so the daemon can reach the API and vsock sockets.
// NOTE(review): the error returns inside this branch happen after the
// helper has already reported a pid, and nothing here stops that process —
// confirm the caller cleans up the VM when this method fails.
if req.Jailer != nil {
chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
hostAPI := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerSocketName, "/"))
if err := atomicSymlink(hostAPI, req.SocketPath); err != nil {
return 0, fmt.Errorf("api socket symlink: %w", err)
}
// The vsock symlink is only installed when a non-blank path was requested.
if strings.TrimSpace(req.VSockPath) != "" {
hostVSock := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerVSockName, "/"))
if err := atomicSymlink(hostVSock, req.VSockPath); err != nil {
return 0, fmt.Errorf("vsock symlink: %w", err)
}
}
}
return pid, nil
}
func (o *helperPrivilegedOps) EnsureSocketAccess(ctx context.Context, socketPath, label string) error {
@ -348,6 +506,10 @@ func (o *helperPrivilegedOps) ProcessRunning(ctx context.Context, pid int, apiSo
return o.client.ProcessRunning(ctx, pid, apiSock)
}
// CleanupJailerChroot asks the root-helper client to clean up the jailer
// chroot rooted at chrootRoot and reports the client's error, if any,
// unchanged.
func (o *helperPrivilegedOps) CleanupJailerChroot(ctx context.Context, chrootRoot string) error {
	err := o.client.CleanupJailerChroot(ctx, chrootRoot)
	return err
}
func (o *helperPrivilegedOps) networkConfig() roothelper.NetworkConfig {
return roothelper.NetworkConfig{
BridgeName: o.config.BridgeName,