firecracker: adopt firecracker-jailer for VM launch (Phase B)
Each VM's firecracker now runs inside a per-VM chroot dropped to the registered owner UID via firecracker-jailer. Closes the broad ambient-sudo escalation surface that survived Phase A: the helper still needs caps for tap/bridge/dm/loop/iptables, but the VMM itself no longer runs as root in the host root filesystem. The host helper stages each chroot up front: hard-links the kernel and (optional) initrd, mknods block-device drives + /dev/vhost-vsock, copies in the firecracker binary (jailer opens it O_RDWR so a ro bind fails with EROFS), and bind-mounts /usr/lib + /lib trees read-only so the dynamic linker can resolve shared libraries. Self-binds the chroot first so the findmnt-guarded cleanup can recurse safely. AF_UNIX sun_path is 108 bytes; the chroot path easily blows past that. Daemon-side launch pre-symlinks the short request socket path to the long chroot socket before Machine.Start so the SDK's poll/connect sees the short path while the kernel resolves to the chroot socket. --new-pid-ns is intentionally disabled — jailer's PID-namespace fork makes the SDK see the parent exit and tear the API socket down too early. CapabilityBoundingSet for the helper expands to add CAP_FOWNER, CAP_KILL, CAP_MKNOD, CAP_SETGID, CAP_SETUID, CAP_SYS_CHROOT alongside the existing CAP_CHOWN/CAP_DAC_OVERRIDE/CAP_NET_ADMIN/CAP_NET_RAW/CAP_SYS_ADMIN. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d73efe6fbc
commit
6b543cb17f
12 changed files with 864 additions and 56 deletions
|
|
@ -3,8 +3,10 @@ package daemon
|
|||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
|
|
@ -39,6 +41,7 @@ type privilegedOps interface {
|
|||
KillProcess(context.Context, int) error
|
||||
SignalProcess(context.Context, int, string) error
|
||||
ProcessRunning(context.Context, int, string) (bool, error)
|
||||
CleanupJailerChroot(context.Context, string) error
|
||||
}
|
||||
|
||||
type localPrivilegedOps struct {
|
||||
|
|
@ -170,7 +173,77 @@ func (o *localPrivilegedOps) ResolveFirecrackerBinary(_ context.Context, request
|
|||
}
|
||||
|
||||
func (o *localPrivilegedOps) LaunchFirecracker(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (int, error) {
|
||||
machine, err := firecracker.NewMachine(ctx, firecracker.MachineConfig{
|
||||
mc, err := o.buildLaunchMachineConfig(ctx, req)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
// Symlink before Start: with jailer the actual API socket lives at
|
||||
// `<chroot>/firecracker.socket` (~120+ bytes — over the AF_UNIX
|
||||
// sun_path limit of 108). The SDK's waitForSocket and connect(2)
|
||||
// would EINVAL on the long path. Pre-creating the symlink at the
|
||||
// short req.SocketPath lets the SDK poll/connect via the short
|
||||
// path; the kernel only enforces sun_path on the path you pass,
|
||||
// not on the resolved target.
|
||||
if err := o.exposeJailerSockets(req); err != nil {
|
||||
return 0, fmt.Errorf("expose jailer sockets: %w", err)
|
||||
}
|
||||
machine, err := firecracker.NewMachine(ctx, mc)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
chownDone := o.maybeChownSockets(ctx, req, mc)
|
||||
startErr := machine.Start(ctx)
|
||||
chownErr := <-chownDone
|
||||
if startErr != nil {
|
||||
if pid := o.fc().ResolvePID(context.Background(), machine, mc.SocketPath); pid > 0 {
|
||||
_ = o.KillProcess(context.Background(), pid)
|
||||
}
|
||||
return 0, startErr
|
||||
}
|
||||
if chownErr != nil {
|
||||
return 0, chownErr
|
||||
}
|
||||
if req.Jailer == nil {
|
||||
// Belt-and-suspenders for the legacy direct-firecracker path.
|
||||
// The jailer path doesn't need this — firecracker drops to the
|
||||
// configured uid before creating the socket.
|
||||
if err := o.EnsureSocketAccess(ctx, mc.SocketPath, "firecracker api socket"); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if strings.TrimSpace(mc.VSockPath) != "" {
|
||||
if err := o.EnsureSocketAccess(ctx, mc.VSockPath, "firecracker vsock socket"); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
}
|
||||
pid := o.fc().ResolvePID(context.Background(), machine, mc.SocketPath)
|
||||
if pid <= 0 {
|
||||
return 0, errors.New("firecracker started but pid could not be resolved")
|
||||
}
|
||||
return pid, nil
|
||||
}
|
||||
|
||||
// maybeChownSockets runs the post-Start sudo-chown race only on the legacy
|
||||
// direct-firecracker path. With the jailer the firecracker process is
|
||||
// already running as the configured uid before it creates the socket, so
|
||||
// no chown is needed (and chown on the symlink would tweak the symlink's
|
||||
// metadata — not the target's — anyway).
|
||||
func (o *localPrivilegedOps) maybeChownSockets(ctx context.Context, req roothelper.FirecrackerLaunchRequest, mc firecracker.MachineConfig) <-chan error {
|
||||
if req.Jailer != nil {
|
||||
ch := make(chan error, 1)
|
||||
ch <- nil
|
||||
close(ch)
|
||||
return ch
|
||||
}
|
||||
return o.fc().EnsureSocketAccessForAsync(ctx, []string{mc.SocketPath, mc.VSockPath}, o.clientUID, o.clientGID)
|
||||
}
|
||||
|
||||
// buildLaunchMachineConfig mirrors the helper-side equivalent: when jailer
|
||||
// is enabled, stage the chroot tree and rewrite the path fields to their
|
||||
// chroot-translated form (host-visible for sockets, chroot-internal for
|
||||
// kernel/drives — see firecracker.MachineConfig.Jailer doc).
|
||||
func (o *localPrivilegedOps) buildLaunchMachineConfig(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (firecracker.MachineConfig, error) {
|
||||
mc := firecracker.MachineConfig{
|
||||
BinaryPath: req.BinaryPath,
|
||||
VMID: req.VMID,
|
||||
SocketPath: req.SocketPath,
|
||||
|
|
@ -186,40 +259,101 @@ func (o *localPrivilegedOps) LaunchFirecracker(ctx context.Context, req roothelp
|
|||
VCPUCount: req.VCPUCount,
|
||||
MemoryMiB: req.MemoryMiB,
|
||||
Logger: o.logger,
|
||||
})
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
// Race the chown against the SDK's HTTP probe inside Start: when the
|
||||
// daemon is non-root, firecracker is launched under sudo and the API
|
||||
// socket appears root-owned. Without a concurrent chown the SDK's
|
||||
// connect(2) gets EACCES and Start times out before our post-Start
|
||||
// EnsureSocketAccess can ever run.
|
||||
chownDone := o.fc().EnsureSocketAccessForAsync(ctx, []string{req.SocketPath, req.VSockPath}, o.clientUID, o.clientGID)
|
||||
startErr := machine.Start(ctx)
|
||||
chownErr := <-chownDone
|
||||
if startErr != nil {
|
||||
if pid := o.fc().ResolvePID(context.Background(), machine, req.SocketPath); pid > 0 {
|
||||
_ = o.KillProcess(context.Background(), pid)
|
||||
}
|
||||
return 0, startErr
|
||||
if req.Jailer == nil {
|
||||
return mc, nil
|
||||
}
|
||||
if chownErr != nil {
|
||||
return 0, chownErr
|
||||
chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
|
||||
driveSpecs := make([]fcproc.ChrootDriveSpec, 0, len(req.Drives))
|
||||
chrootDrives := make([]firecracker.DriveConfig, 0, len(req.Drives))
|
||||
for _, d := range req.Drives {
|
||||
name := chrootDriveName(d)
|
||||
driveSpecs = append(driveSpecs, fcproc.ChrootDriveSpec{ChrootName: name, HostPath: d.Path})
|
||||
chrootDrives = append(chrootDrives, firecracker.DriveConfig{
|
||||
ID: d.ID,
|
||||
Path: "/" + name,
|
||||
ReadOnly: d.ReadOnly,
|
||||
IsRoot: d.IsRoot,
|
||||
})
|
||||
}
|
||||
if err := o.EnsureSocketAccess(ctx, req.SocketPath, "firecracker api socket"); err != nil {
|
||||
return 0, err
|
||||
wantVSock := strings.TrimSpace(req.VSockPath) != ""
|
||||
if err := o.fc().PrepareJailerChroot(ctx, chrootRoot,
|
||||
req.Jailer.UID, req.Jailer.GID,
|
||||
req.BinaryPath,
|
||||
req.KernelImagePath, "vmlinux",
|
||||
req.InitrdPath, "initrd",
|
||||
driveSpecs, wantVSock,
|
||||
); err != nil {
|
||||
return firecracker.MachineConfig{}, fmt.Errorf("prepare jailer chroot: %w", err)
|
||||
}
|
||||
// SocketPath stays the short request path: the SDK polls/connects
|
||||
// to it via os.Stat / net.Dial("unix", ...), and AF_UNIX sun_path
|
||||
// is hard-capped at 108 bytes — the actual chroot path is well over
|
||||
// that. exposeJailerSockets pre-creates the req.SocketPath as a
|
||||
// symlink whose target is the long chroot socket; the kernel only
|
||||
// enforces sun_path on the path you hand to connect, not on the
|
||||
// resolved target.
|
||||
//
|
||||
// VSockPath, by contrast, is sent to firecracker via the API and
|
||||
// resolved from inside the chroot, so it must be the chroot-internal
|
||||
// path. The host-visible vsock socket is reachable via a symlink
|
||||
// at req.VSockPath, also installed by exposeJailerSockets.
|
||||
_ = chrootRoot
|
||||
if wantVSock {
|
||||
mc.VSockPath = firecracker.JailerVSockName
|
||||
}
|
||||
mc.KernelImagePath = "/vmlinux"
|
||||
if strings.TrimSpace(req.InitrdPath) != "" {
|
||||
mc.InitrdPath = "/initrd"
|
||||
} else {
|
||||
mc.InitrdPath = ""
|
||||
}
|
||||
mc.Drives = chrootDrives
|
||||
// LogPath stays set so buildProcessRunner's openLogFile captures firecracker
|
||||
// stderr via cmd.Stderr. buildConfig clears sdk.Config.LogPath for jailer
|
||||
// mode to avoid PUT /logger with a host path firecracker can't open.
|
||||
mc.MetricsPath = ""
|
||||
mc.Jailer = &firecracker.JailerOpts{
|
||||
Binary: req.Jailer.Binary,
|
||||
ChrootBaseDir: req.Jailer.ChrootBaseDir,
|
||||
UID: req.Jailer.UID,
|
||||
GID: req.Jailer.GID,
|
||||
}
|
||||
return mc, nil
|
||||
}
|
||||
|
||||
func (o *localPrivilegedOps) exposeJailerSockets(req roothelper.FirecrackerLaunchRequest) error {
|
||||
if req.Jailer == nil {
|
||||
return nil
|
||||
}
|
||||
chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
|
||||
hostAPI := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerSocketName, "/"))
|
||||
if err := atomicSymlink(hostAPI, req.SocketPath); err != nil {
|
||||
return err
|
||||
}
|
||||
if strings.TrimSpace(req.VSockPath) != "" {
|
||||
if err := o.EnsureSocketAccess(ctx, req.VSockPath, "firecracker vsock socket"); err != nil {
|
||||
return 0, err
|
||||
hostVSock := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerVSockName, "/"))
|
||||
if err := atomicSymlink(hostVSock, req.VSockPath); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
pid := o.fc().ResolvePID(context.Background(), machine, req.SocketPath)
|
||||
if pid <= 0 {
|
||||
return 0, errors.New("firecracker started but pid could not be resolved")
|
||||
return nil
|
||||
}
|
||||
|
||||
// chrootDriveName mirrors the helper-side helper of the same name; kept as
|
||||
// a free function so both paths produce identical chroot layouts.
|
||||
func chrootDriveName(d firecracker.DriveConfig) string {
|
||||
if id := strings.TrimSpace(d.ID); id != "" {
|
||||
return id
|
||||
}
|
||||
return pid, nil
|
||||
return filepath.Base(d.Path)
|
||||
}
|
||||
|
||||
// atomicSymlink makes link a symbolic link pointing at target, replacing
// whatever non-directory entry currently lives at link.
//
// The new symlink is first created inside a private staging directory next
// to link and then moved into place with os.Rename. On POSIX systems a
// rename over an existing entry is atomic, so concurrent readers observe
// either the old entry or the new symlink — never a missing path — and a
// failure while creating the new symlink leaves the existing entry intact.
//
// If the rename is refused (for example because link is currently an empty
// directory, which rename will not overwrite with a non-directory), the
// function falls back to removing link and renaming again. That fallback is
// not atomic but keeps the set of inputs accepted by the previous
// remove-then-symlink implementation.
//
// target is stored verbatim; a relative target is resolved relative to the
// directory containing link, exactly as with os.Symlink(target, link).
func atomicSymlink(target, link string) error {
	stage, err := os.MkdirTemp(filepath.Dir(link), ".symlink-*")
	if err != nil {
		return err
	}
	// Best-effort cleanup: the staging directory is empty after a successful
	// rename, and a leftover one does not affect the installed link.
	defer func() { _ = os.RemoveAll(stage) }()

	staged := filepath.Join(stage, filepath.Base(link))
	if err := os.Symlink(target, staged); err != nil {
		// Nothing at link has been touched yet.
		return err
	}
	if err := os.Rename(staged, link); err == nil {
		return nil
	}

	// Rename was refused; clear the destination explicitly and retry once.
	if err := os.Remove(link); err != nil && !os.IsNotExist(err) {
		return err
	}
	return os.Rename(staged, link)
}
|
||||
|
||||
func (o *localPrivilegedOps) EnsureSocketAccess(ctx context.Context, socketPath, label string) error {
|
||||
|
|
@ -246,6 +380,10 @@ func (o *localPrivilegedOps) ProcessRunning(_ context.Context, pid int, apiSock
|
|||
return system.ProcessRunning(pid, apiSock), nil
|
||||
}
|
||||
|
||||
// CleanupJailerChroot implements privilegedOps for the in-process path: it
// passes ctx and chrootRoot unchanged to the fcproc manager returned by
// o.fc() and returns that manager's error as-is.
func (o *localPrivilegedOps) CleanupJailerChroot(ctx context.Context, chrootRoot string) error {
|
||||
return o.fc().CleanupJailerChroot(ctx, chrootRoot)
|
||||
}
|
||||
|
||||
func (o *localPrivilegedOps) fc() *fcproc.Manager {
|
||||
return fcproc.New(o.runner, fcproc.Config{
|
||||
FirecrackerBin: normalizeFirecrackerBinary("", o.config.FirecrackerBin),
|
||||
|
|
@ -320,7 +458,27 @@ func (o *helperPrivilegedOps) ResolveFirecrackerBinary(ctx context.Context, requ
|
|||
|
||||
func (o *helperPrivilegedOps) LaunchFirecracker(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (int, error) {
|
||||
req.Network = o.networkConfig()
|
||||
return o.client.LaunchFirecracker(ctx, req)
|
||||
pid, err := o.client.LaunchFirecracker(ctx, req)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
// The root helper runs with PrivateMounts=yes, so symlinks it creates
|
||||
// (exposeJailerSockets) are invisible to the daemon's namespace. Re-create
|
||||
// them here so the daemon can reach the API and vsock sockets.
|
||||
if req.Jailer != nil {
|
||||
chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
|
||||
hostAPI := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerSocketName, "/"))
|
||||
if err := atomicSymlink(hostAPI, req.SocketPath); err != nil {
|
||||
return 0, fmt.Errorf("api socket symlink: %w", err)
|
||||
}
|
||||
if strings.TrimSpace(req.VSockPath) != "" {
|
||||
hostVSock := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerVSockName, "/"))
|
||||
if err := atomicSymlink(hostVSock, req.VSockPath); err != nil {
|
||||
return 0, fmt.Errorf("vsock symlink: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
return pid, nil
|
||||
}
|
||||
|
||||
func (o *helperPrivilegedOps) EnsureSocketAccess(ctx context.Context, socketPath, label string) error {
|
||||
|
|
@ -348,6 +506,10 @@ func (o *helperPrivilegedOps) ProcessRunning(ctx context.Context, pid int, apiSo
|
|||
return o.client.ProcessRunning(ctx, pid, apiSock)
|
||||
}
|
||||
|
||||
// CleanupJailerChroot implements privilegedOps for the helper-backed path:
// it forwards ctx and chrootRoot unchanged to o.client.CleanupJailerChroot
// and returns its error as-is.
// NOTE(review): o.client is presumably the root-helper client — confirm
// against the helperPrivilegedOps definition.
func (o *helperPrivilegedOps) CleanupJailerChroot(ctx context.Context, chrootRoot string) error {
|
||||
return o.client.CleanupJailerChroot(ctx, chrootRoot)
|
||||
}
|
||||
|
||||
func (o *helperPrivilegedOps) networkConfig() roothelper.NetworkConfig {
|
||||
return roothelper.NetworkConfig{
|
||||
BridgeName: o.config.BridgeName,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue