firecracker: adopt firecracker-jailer for VM launch (Phase B)

Each VM's firecracker now runs inside a per-VM chroot dropped to the
registered owner UID via firecracker-jailer. Closes the broad ambient-
sudo escalation surface that survived Phase A: the helper still needs
caps for tap/bridge/dm/loop/iptables, but the VMM itself no longer
runs as root in the host root filesystem.

The host helper stages each chroot up front: hard-links the kernel
and (optional) initrd, mknods block-device drives + /dev/vhost-vsock,
copies in the firecracker binary (jailer opens it O_RDWR so a ro bind
fails with EROFS), and bind-mounts /usr/lib + /lib trees read-only so
the dynamic linker can resolve. Self-binds the chroot first so the
findmnt-guarded cleanup can recurse safely.

AF_UNIX sun_path is 108 bytes; the chroot path easily blows past that.
Daemon-side launch pre-symlinks the short request socket path to the
long chroot socket before Machine.Start so the SDK's poll/connect
sees the short path while the kernel resolves to the chroot socket.
--new-pid-ns is intentionally disabled — jailer's PID-namespace fork
makes the SDK see the parent exit and tear the API socket down too
early.

CapabilityBoundingSet for the helper expands to add CAP_FOWNER,
CAP_KILL, CAP_MKNOD, CAP_SETGID, CAP_SETUID, CAP_SYS_CHROOT alongside
the existing CAP_CHOWN/CAP_DAC_OVERRIDE/CAP_NET_ADMIN/CAP_NET_RAW/
CAP_SYS_ADMIN.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Thales Maciel 2026-04-28 14:38:07 -03:00
parent d73efe6fbc
commit 6b543cb17f
No known key found for this signature in database
GPG key ID: 33112E6833C34679
12 changed files with 864 additions and 56 deletions

View file

@ -47,6 +47,7 @@ const (
methodKillProcess = "priv.kill_process"
methodSignalProcess = "priv.signal_process"
methodProcessRunning = "priv.process_running"
methodCleanupJailerChroot = "priv.cleanup_jailer_chroot"
rootfsDMNamePrefix = "fc-rootfs-"
vmTapPrefix = "tap-fc-"
tapPoolPrefix = "tap-pool-"
@ -82,6 +83,18 @@ type FirecrackerLaunchRequest struct {
VCPUCount int `json:"vcpu_count"`
MemoryMiB int `json:"memory_mib"`
Network NetworkConfig `json:"network"`
Jailer *JailerLaunchOpts `json:"jailer,omitempty"`
}
// JailerLaunchOpts is the RPC wire form of firecracker.JailerOpts. It is
// carried in FirecrackerLaunchRequest.Jailer; a nil value there selects the
// legacy direct-firecracker launch path.
//
// The helper does not trust these values blindly: the launch path checks the
// binary, the chroot base directory and the uid/gid before staging a chroot,
// so the daemon cannot ask the helper to run firecracker as an arbitrary user.
type JailerLaunchOpts struct {
	// Binary is the path of the jailer executable.
	Binary string `json:"binary"`
	// ChrootBaseDir is the directory under which the per-VM chroot root is
	// derived (together with the VM ID).
	ChrootBaseDir string `json:"chroot_base_dir"`
	// UID is the user id the jailer drops firecracker to; it must equal the
	// registered owner's uid.
	UID int `json:"uid"`
	// GID is the group id the jailer drops firecracker to; it must equal the
	// registered owner's gid.
	GID int `json:"gid"`
}
type findPIDResult struct {
@ -220,6 +233,13 @@ func (c *Client) LaunchFirecracker(ctx context.Context, req FirecrackerLaunchReq
return result.PID, nil
}
// CleanupJailerChroot asks the privileged helper, over the client's RPC
// socket, to run the "priv.cleanup_jailer_chroot" method for chrootRoot.
// The call carries no result payload; only the RPC error (if any) is
// returned to the caller.
func (c *Client) CleanupJailerChroot(ctx context.Context, chrootRoot string) error {
	// Wire shape of the request parameters; the server decodes the same
	// single "chroot_root" field.
	params := struct {
		ChrootRoot string `json:"chroot_root"`
	}{
		ChrootRoot: chrootRoot,
	}
	if _, err := rpc.Call[struct{}](ctx, c.socketPath, methodCleanupJailerChroot, params); err != nil {
		return err
	}
	return nil
}
func (c *Client) EnsureSocketAccess(ctx context.Context, socketPath, label string) error {
_, err := rpc.Call[struct{}](ctx, c.socketPath, methodEnsureSocketAccess, struct {
SocketPath string `json:"socket_path"`
@ -589,6 +609,19 @@ func (s *Server) dispatch(ctx context.Context, req rpc.Request) rpc.Response {
return rpc.NewError("bad_params", err.Error())
}
return marshalResultOrError(processRunningResult{Running: system.ProcessRunning(params.PID, params.APISock)}, nil)
case methodCleanupJailerChroot:
params, err := rpc.DecodeParams[struct {
ChrootRoot string `json:"chroot_root"`
}](req)
if err != nil {
return rpc.NewError("bad_params", err.Error())
}
systemLayout := paths.ResolveSystem()
if err := s.validateManagedPath(params.ChrootRoot, systemLayout.StateDir, systemLayout.RuntimeDir); err != nil {
return rpc.NewError("invalid_path", err.Error())
}
err = fcproc.New(s.runner, fcproc.Config{}, s.logger).CleanupJailerChroot(ctx, params.ChrootRoot)
return marshalResultOrError(struct{}{}, err)
default:
return rpc.NewError("unknown_method", req.Method)
}
@ -718,7 +751,59 @@ func (s *Server) launchFirecracker(ctx context.Context, req FirecrackerLaunchReq
return 0, err
}
}
machine, err := firecracker.NewMachine(ctx, firecracker.MachineConfig{
mgr := fcproc.New(s.runner, fcproc.Config{BridgeName: req.Network.BridgeName, BridgeIP: req.Network.BridgeIP, CIDR: req.Network.CIDR}, s.logger)
mc, err := s.buildLaunchMachineConfig(ctx, req, systemLayout, mgr)
if err != nil {
return 0, err
}
// Pre-Start symlink: see localPrivilegedOps.LaunchFirecracker for
// the AF_UNIX sun_path-length rationale.
if err := s.exposeJailerSockets(req); err != nil {
return 0, fmt.Errorf("expose jailer sockets: %w", err)
}
machine, err := firecracker.NewMachine(ctx, mc)
if err != nil {
return 0, err
}
if err := machine.Start(ctx); err != nil {
if pid := mgr.ResolvePID(context.Background(), machine, mc.SocketPath); pid > 0 {
_, _ = s.runner.Run(context.Background(), "kill", "-KILL", strconv.Itoa(pid))
}
return 0, err
}
if req.Jailer == nil {
// Belt-and-suspenders only on the legacy direct-firecracker path;
// the jailer drops to the configured uid before creating the
// socket, so its perms are correct by construction.
if err := mgr.EnsureSocketAccessFor(ctx, mc.SocketPath, "firecracker api socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil {
return 0, err
}
if strings.TrimSpace(mc.VSockPath) != "" {
if err := mgr.EnsureSocketAccessFor(ctx, mc.VSockPath, "firecracker vsock socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil {
return 0, err
}
}
}
pid := mgr.ResolvePID(context.Background(), machine, mc.SocketPath)
if pid <= 0 {
return 0, errors.New("firecracker started but pid could not be resolved")
}
return pid, nil
}
// buildLaunchMachineConfig assembles the firecracker.MachineConfig used by
// launchFirecracker, performing the chroot staging when jailer is enabled.
// In the non-jailer case it's a straight field copy from the request.
//
// In the jailer case it:
// - validates JailerLaunchOpts (binary executable, chroot base under StateDir
// or RuntimeDir, uid/gid match the registered owner — the daemon can't ask
// the helper to drop firecracker into an arbitrary uid)
// - calls fcproc.PrepareJailerChroot to build the chroot tree
// - keeps SocketPath as the short request path, and rewrites VSockPath,
// KernelImagePath/InitrdPath and Drives[].Path to chroot-internal names
// - clears MetricsPath and fills in the firecracker.JailerOpts
func (s *Server) buildLaunchMachineConfig(ctx context.Context, req FirecrackerLaunchRequest, layout paths.Layout, mgr *fcproc.Manager) (firecracker.MachineConfig, error) {
mc := firecracker.MachineConfig{
BinaryPath: req.BinaryPath,
VMID: req.VMID,
SocketPath: req.SocketPath,
@ -734,31 +819,120 @@ func (s *Server) launchFirecracker(ctx context.Context, req FirecrackerLaunchReq
VCPUCount: req.VCPUCount,
MemoryMiB: req.MemoryMiB,
Logger: s.logger,
})
if err != nil {
return 0, err
}
if err := machine.Start(ctx); err != nil {
manager := fcproc.New(s.runner, fcproc.Config{BridgeName: req.Network.BridgeName, BridgeIP: req.Network.BridgeIP, CIDR: req.Network.CIDR}, s.logger)
if pid := manager.ResolvePID(context.Background(), machine, req.SocketPath); pid > 0 {
_, _ = s.runner.Run(context.Background(), "kill", "-KILL", strconv.Itoa(pid))
}
return 0, err
if req.Jailer == nil {
return mc, nil
}
manager := fcproc.New(s.runner, fcproc.Config{BridgeName: req.Network.BridgeName, BridgeIP: req.Network.BridgeIP, CIDR: req.Network.CIDR}, s.logger)
if err := manager.EnsureSocketAccessFor(ctx, req.SocketPath, "firecracker api socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil {
return 0, err
if err := s.validateJailerOpts(*req.Jailer, layout); err != nil {
return firecracker.MachineConfig{}, err
}
chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
driveSpecs := make([]fcproc.ChrootDriveSpec, 0, len(req.Drives))
chrootDrives := make([]firecracker.DriveConfig, 0, len(req.Drives))
for _, d := range req.Drives {
name := chrootDriveName(d)
driveSpecs = append(driveSpecs, fcproc.ChrootDriveSpec{ChrootName: name, HostPath: d.Path})
chrootDrives = append(chrootDrives, firecracker.DriveConfig{
ID: d.ID,
Path: "/" + name,
ReadOnly: d.ReadOnly,
IsRoot: d.IsRoot,
})
}
wantVSock := strings.TrimSpace(req.VSockPath) != ""
if err := mgr.PrepareJailerChroot(ctx, chrootRoot,
req.Jailer.UID, req.Jailer.GID,
req.BinaryPath,
req.KernelImagePath, "vmlinux",
req.InitrdPath, "initrd",
driveSpecs, wantVSock,
); err != nil {
return firecracker.MachineConfig{}, fmt.Errorf("prepare jailer chroot: %w", err)
}
// See localPrivilegedOps.buildLaunchMachineConfig for why SocketPath
// stays the short req path but VSockPath becomes chroot-internal.
_ = chrootRoot
if wantVSock {
mc.VSockPath = firecracker.JailerVSockName
}
mc.KernelImagePath = "/vmlinux"
if strings.TrimSpace(req.InitrdPath) != "" {
mc.InitrdPath = "/initrd"
} else {
mc.InitrdPath = ""
}
mc.Drives = chrootDrives
// LogPath stays set so buildProcessRunner's openLogFile captures firecracker
// stderr via cmd.Stderr. buildConfig clears sdk.Config.LogPath for jailer
// mode to avoid PUT /logger with a host path firecracker can't open.
mc.MetricsPath = ""
mc.Jailer = &firecracker.JailerOpts{
Binary: req.Jailer.Binary,
ChrootBaseDir: req.Jailer.ChrootBaseDir,
UID: req.Jailer.UID,
GID: req.Jailer.GID,
}
return mc, nil
}
// validateJailerOpts vets the jailer options of a launch request before any
// chroot is staged. The checks run in this order and the first failure is
// returned, wrapped with the name of the offending option:
//
//  1. opts.Binary must pass validateRootExecutable.
//  2. opts.ChrootBaseDir must be a managed path under layout.StateDir or
//     layout.RuntimeDir.
//  3. opts.UID/opts.GID must equal the registered owner's uid/gid.
func (s *Server) validateJailerOpts(opts JailerLaunchOpts, layout paths.Layout) error {
	binErr := validateRootExecutable(opts.Binary)
	if binErr != nil {
		return fmt.Errorf("jailer binary: %w", binErr)
	}

	// The chroot base is expected under StateDir so that hard-links into the
	// chroot share a filesystem with the image cache (RuntimeDir is tmpfs and
	// os.Link would fail with EXDEV). RuntimeDir is still accepted because
	// the jailer works on tmpfs when the kernel/drives happen to colocate
	// there (e.g. tests).
	baseErr := s.validateManagedPath(opts.ChrootBaseDir, layout.StateDir, layout.RuntimeDir)
	if baseErr != nil {
		return fmt.Errorf("jailer chroot base: %w", baseErr)
	}

	ownerUID, ownerGID := s.meta.OwnerUID, s.meta.OwnerGID
	if opts.UID == ownerUID && opts.GID == ownerGID {
		return nil
	}
	return fmt.Errorf("jailer uid/gid (%d:%d) must match registered owner (%d:%d)", opts.UID, opts.GID, ownerUID, ownerGID)
}
// exposeJailerSockets makes the chroot-internal sockets reachable at the
// host paths the daemon already references (sc.apiSock, vm.Runtime.VSockPath).
// AF_UNIX connect(2) follows symlinks, so a symlink keeps the rest of the
// daemon code unchanged. Computes both host targets from the chroot root and
// the chroot-internal name, so the API socket and the vsock socket stay in
// sync regardless of how the launch request laid them out.
func (s *Server) exposeJailerSockets(req FirecrackerLaunchRequest) error {
if req.Jailer == nil {
return nil
}
chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
hostAPI := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerSocketName, "/"))
if err := atomicSymlink(hostAPI, req.SocketPath); err != nil {
return fmt.Errorf("api socket symlink: %w", err)
}
if strings.TrimSpace(req.VSockPath) != "" {
if err := manager.EnsureSocketAccessFor(ctx, req.VSockPath, "firecracker vsock socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil {
return 0, err
hostVSock := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerVSockName, "/"))
if err := atomicSymlink(hostVSock, req.VSockPath); err != nil {
return fmt.Errorf("vsock symlink: %w", err)
}
}
pid := manager.ResolvePID(context.Background(), machine, req.SocketPath)
if pid <= 0 {
return 0, errors.New("firecracker started but pid could not be resolved")
return nil
}
// atomicSymlink makes link a symlink pointing at target, replacing whatever
// is currently at link.
//
// The link is first created under a temporary sibling name and then renamed
// over link, so a concurrent reader sees either the old entry or the new
// symlink, never a missing path. rename cannot replace a directory with a
// symlink, so if the rename fails the function falls back to the non-atomic
// remove-then-create sequence, which also handles an empty directory at link.
//
// target is stored verbatim and is not required to exist. The returned error
// is the underlying os error of the step that failed.
func atomicSymlink(target, link string) error {
	// Sibling name in the same directory so the rename stays on one
	// filesystem. A leftover from an interrupted earlier call is cleared first.
	tmp := link + ".tmp-" + strconv.Itoa(os.Getpid())
	if err := os.Remove(tmp); err != nil && !errors.Is(err, os.ErrNotExist) {
		return err
	}
	if err := os.Symlink(target, tmp); err != nil {
		return err
	}
	if err := os.Rename(tmp, link); err == nil {
		return nil
	}

	// Fallback: best-effort removal of the temporary link (a failure here is
	// harmless and must not mask the real error below), then replace the
	// existing entry the slow way.
	_ = os.Remove(tmp)
	if err := os.Remove(link); err != nil && !errors.Is(err, os.ErrNotExist) {
		return err
	}
	return os.Symlink(target, link)
}
// chrootDriveName picks the bare file name under which a drive is staged
// inside the chroot. A non-blank drive ID (whitespace-trimmed) wins, which
// keeps the chroot listing self-explanatory (rootfs, work, …); drives without
// an ID fall back to the base name of their source path.
//
// NOTE(review): the ID is returned as-is apart from trimming — presumably it
// is validated elsewhere to contain no path separators; confirm.
func chrootDriveName(d firecracker.DriveConfig) string {
	id := strings.TrimSpace(d.ID)
	if id == "" {
		return filepath.Base(d.Path)
	}
	return id
}
func (s *Server) validateLaunchDrivePath(drive firecracker.DriveConfig, stateDir string) error {