firecracker: adopt firecracker-jailer for VM launch (Phase B)
Each VM's firecracker now runs inside a per-VM chroot dropped to the registered owner UID via firecracker-jailer. Closes the broad ambient-sudo escalation surface that survived Phase A: the helper still needs caps for tap/bridge/dm/loop/iptables, but the VMM itself no longer runs as root in the host root filesystem. The host helper stages each chroot up front: hard-links the kernel and (optional) initrd, mknods block-device drives + /dev/vhost-vsock, copies in the firecracker binary (jailer opens it O_RDWR so a ro bind fails with EROFS), and bind-mounts /usr/lib + /lib trees read-only so the dynamic linker can resolve. Self-binds the chroot first so the findmnt-guarded cleanup can recurse safely. AF_UNIX sun_path is 108 bytes; the chroot path easily blows past that. Daemon-side launch pre-symlinks the short request socket path to the long chroot socket before Machine.Start so the SDK's poll/connect sees the short path while the kernel resolves to the chroot socket. --new-pid-ns is intentionally disabled — jailer's PID-namespace fork makes the SDK see the parent exit and tear the API socket down too early. CapabilityBoundingSet for the helper expands to add CAP_FOWNER, CAP_KILL, CAP_MKNOD, CAP_SETGID, CAP_SETUID, CAP_SYS_CHROOT alongside the existing CAP_CHOWN/CAP_DAC_OVERRIDE/CAP_NET_ADMIN/CAP_NET_RAW/CAP_SYS_ADMIN. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d73efe6fbc
commit
6b543cb17f
12 changed files with 864 additions and 56 deletions
|
|
@ -47,6 +47,7 @@ const (
|
|||
methodKillProcess = "priv.kill_process"
|
||||
methodSignalProcess = "priv.signal_process"
|
||||
methodProcessRunning = "priv.process_running"
|
||||
methodCleanupJailerChroot = "priv.cleanup_jailer_chroot"
|
||||
rootfsDMNamePrefix = "fc-rootfs-"
|
||||
vmTapPrefix = "tap-fc-"
|
||||
tapPoolPrefix = "tap-pool-"
|
||||
|
|
@ -82,6 +83,18 @@ type FirecrackerLaunchRequest struct {
|
|||
VCPUCount int `json:"vcpu_count"`
|
||||
MemoryMiB int `json:"memory_mib"`
|
||||
Network NetworkConfig `json:"network"`
|
||||
Jailer *JailerLaunchOpts `json:"jailer,omitempty"`
|
||||
}
|
||||
|
||||
// JailerLaunchOpts mirrors firecracker.JailerOpts for the RPC wire. UID
// and GID are the (un)privileged target the jailer drops to; the helper
// enforces they match the registered owner so the daemon can't ask the
// helper to run firecracker as an arbitrary user.
type JailerLaunchOpts struct {
	// Binary is the path of the jailer executable. The helper rejects the
	// launch unless it passes validateRootExecutable.
	Binary string `json:"binary"`
	// ChrootBaseDir is the directory under which the per-VM chroot root is
	// derived (together with the VM ID). It must sit under the helper's
	// managed StateDir or RuntimeDir.
	ChrootBaseDir string `json:"chroot_base_dir"`
	// UID is the user the jailer drops to; must equal the registered owner UID.
	UID int `json:"uid"`
	// GID is the group the jailer drops to; must equal the registered owner GID.
	GID int `json:"gid"`
}
|
||||
|
||||
type findPIDResult struct {
|
||||
|
|
@ -220,6 +233,13 @@ func (c *Client) LaunchFirecracker(ctx context.Context, req FirecrackerLaunchReq
|
|||
return result.PID, nil
|
||||
}
|
||||
|
||||
func (c *Client) CleanupJailerChroot(ctx context.Context, chrootRoot string) error {
|
||||
_, err := rpc.Call[struct{}](ctx, c.socketPath, methodCleanupJailerChroot, struct {
|
||||
ChrootRoot string `json:"chroot_root"`
|
||||
}{ChrootRoot: chrootRoot})
|
||||
return err
|
||||
}
|
||||
|
||||
func (c *Client) EnsureSocketAccess(ctx context.Context, socketPath, label string) error {
|
||||
_, err := rpc.Call[struct{}](ctx, c.socketPath, methodEnsureSocketAccess, struct {
|
||||
SocketPath string `json:"socket_path"`
|
||||
|
|
@ -589,6 +609,19 @@ func (s *Server) dispatch(ctx context.Context, req rpc.Request) rpc.Response {
|
|||
return rpc.NewError("bad_params", err.Error())
|
||||
}
|
||||
return marshalResultOrError(processRunningResult{Running: system.ProcessRunning(params.PID, params.APISock)}, nil)
|
||||
case methodCleanupJailerChroot:
|
||||
params, err := rpc.DecodeParams[struct {
|
||||
ChrootRoot string `json:"chroot_root"`
|
||||
}](req)
|
||||
if err != nil {
|
||||
return rpc.NewError("bad_params", err.Error())
|
||||
}
|
||||
systemLayout := paths.ResolveSystem()
|
||||
if err := s.validateManagedPath(params.ChrootRoot, systemLayout.StateDir, systemLayout.RuntimeDir); err != nil {
|
||||
return rpc.NewError("invalid_path", err.Error())
|
||||
}
|
||||
err = fcproc.New(s.runner, fcproc.Config{}, s.logger).CleanupJailerChroot(ctx, params.ChrootRoot)
|
||||
return marshalResultOrError(struct{}{}, err)
|
||||
default:
|
||||
return rpc.NewError("unknown_method", req.Method)
|
||||
}
|
||||
|
|
@ -718,7 +751,59 @@ func (s *Server) launchFirecracker(ctx context.Context, req FirecrackerLaunchReq
|
|||
return 0, err
|
||||
}
|
||||
}
|
||||
machine, err := firecracker.NewMachine(ctx, firecracker.MachineConfig{
|
||||
mgr := fcproc.New(s.runner, fcproc.Config{BridgeName: req.Network.BridgeName, BridgeIP: req.Network.BridgeIP, CIDR: req.Network.CIDR}, s.logger)
|
||||
mc, err := s.buildLaunchMachineConfig(ctx, req, systemLayout, mgr)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
// Pre-Start symlink: see localPrivilegedOps.LaunchFirecracker for
|
||||
// the AF_UNIX sun_path-length rationale.
|
||||
if err := s.exposeJailerSockets(req); err != nil {
|
||||
return 0, fmt.Errorf("expose jailer sockets: %w", err)
|
||||
}
|
||||
machine, err := firecracker.NewMachine(ctx, mc)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if err := machine.Start(ctx); err != nil {
|
||||
if pid := mgr.ResolvePID(context.Background(), machine, mc.SocketPath); pid > 0 {
|
||||
_, _ = s.runner.Run(context.Background(), "kill", "-KILL", strconv.Itoa(pid))
|
||||
}
|
||||
return 0, err
|
||||
}
|
||||
if req.Jailer == nil {
|
||||
// Belt-and-suspenders only on the legacy direct-firecracker path;
|
||||
// the jailer drops to the configured uid before creating the
|
||||
// socket, so its perms are correct by construction.
|
||||
if err := mgr.EnsureSocketAccessFor(ctx, mc.SocketPath, "firecracker api socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if strings.TrimSpace(mc.VSockPath) != "" {
|
||||
if err := mgr.EnsureSocketAccessFor(ctx, mc.VSockPath, "firecracker vsock socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
}
|
||||
pid := mgr.ResolvePID(context.Background(), machine, mc.SocketPath)
|
||||
if pid <= 0 {
|
||||
return 0, errors.New("firecracker started but pid could not be resolved")
|
||||
}
|
||||
return pid, nil
|
||||
}
|
||||
|
||||
// buildLaunchMachineConfig assembles the firecracker.MachineConfig used by
|
||||
// launchFirecracker, performing the chroot staging when jailer is enabled.
|
||||
// In the non-jailer case it's a straight field copy from the request.
|
||||
//
|
||||
// In the jailer case it:
|
||||
// - validates JailerLaunchOpts (binary executable, chroot under StateDir or RuntimeDir,
|
||||
// uid/gid match the registered owner — the daemon can't ask the helper to
|
||||
// drop firecracker into an arbitrary uid)
|
||||
// - calls fcproc.PrepareJailerChroot to build the chroot tree
|
||||
// - keeps SocketPath as the short request path and rewrites VSockPath,
|
||||
// KernelImagePath/InitrdPath/Drives[].Path to chroot-internal names
|
||||
func (s *Server) buildLaunchMachineConfig(ctx context.Context, req FirecrackerLaunchRequest, layout paths.Layout, mgr *fcproc.Manager) (firecracker.MachineConfig, error) {
|
||||
mc := firecracker.MachineConfig{
|
||||
BinaryPath: req.BinaryPath,
|
||||
VMID: req.VMID,
|
||||
SocketPath: req.SocketPath,
|
||||
|
|
@ -734,31 +819,120 @@ func (s *Server) launchFirecracker(ctx context.Context, req FirecrackerLaunchReq
|
|||
VCPUCount: req.VCPUCount,
|
||||
MemoryMiB: req.MemoryMiB,
|
||||
Logger: s.logger,
|
||||
})
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if err := machine.Start(ctx); err != nil {
|
||||
manager := fcproc.New(s.runner, fcproc.Config{BridgeName: req.Network.BridgeName, BridgeIP: req.Network.BridgeIP, CIDR: req.Network.CIDR}, s.logger)
|
||||
if pid := manager.ResolvePID(context.Background(), machine, req.SocketPath); pid > 0 {
|
||||
_, _ = s.runner.Run(context.Background(), "kill", "-KILL", strconv.Itoa(pid))
|
||||
}
|
||||
return 0, err
|
||||
if req.Jailer == nil {
|
||||
return mc, nil
|
||||
}
|
||||
manager := fcproc.New(s.runner, fcproc.Config{BridgeName: req.Network.BridgeName, BridgeIP: req.Network.BridgeIP, CIDR: req.Network.CIDR}, s.logger)
|
||||
if err := manager.EnsureSocketAccessFor(ctx, req.SocketPath, "firecracker api socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil {
|
||||
return 0, err
|
||||
if err := s.validateJailerOpts(*req.Jailer, layout); err != nil {
|
||||
return firecracker.MachineConfig{}, err
|
||||
}
|
||||
chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
|
||||
driveSpecs := make([]fcproc.ChrootDriveSpec, 0, len(req.Drives))
|
||||
chrootDrives := make([]firecracker.DriveConfig, 0, len(req.Drives))
|
||||
for _, d := range req.Drives {
|
||||
name := chrootDriveName(d)
|
||||
driveSpecs = append(driveSpecs, fcproc.ChrootDriveSpec{ChrootName: name, HostPath: d.Path})
|
||||
chrootDrives = append(chrootDrives, firecracker.DriveConfig{
|
||||
ID: d.ID,
|
||||
Path: "/" + name,
|
||||
ReadOnly: d.ReadOnly,
|
||||
IsRoot: d.IsRoot,
|
||||
})
|
||||
}
|
||||
wantVSock := strings.TrimSpace(req.VSockPath) != ""
|
||||
if err := mgr.PrepareJailerChroot(ctx, chrootRoot,
|
||||
req.Jailer.UID, req.Jailer.GID,
|
||||
req.BinaryPath,
|
||||
req.KernelImagePath, "vmlinux",
|
||||
req.InitrdPath, "initrd",
|
||||
driveSpecs, wantVSock,
|
||||
); err != nil {
|
||||
return firecracker.MachineConfig{}, fmt.Errorf("prepare jailer chroot: %w", err)
|
||||
}
|
||||
// See localPrivilegedOps.buildLaunchMachineConfig for why SocketPath
|
||||
// stays the short req path but VSockPath becomes chroot-internal.
|
||||
_ = chrootRoot
|
||||
if wantVSock {
|
||||
mc.VSockPath = firecracker.JailerVSockName
|
||||
}
|
||||
mc.KernelImagePath = "/vmlinux"
|
||||
if strings.TrimSpace(req.InitrdPath) != "" {
|
||||
mc.InitrdPath = "/initrd"
|
||||
} else {
|
||||
mc.InitrdPath = ""
|
||||
}
|
||||
mc.Drives = chrootDrives
|
||||
// LogPath stays set so buildProcessRunner's openLogFile captures firecracker
|
||||
// stderr via cmd.Stderr. buildConfig clears sdk.Config.LogPath for jailer
|
||||
// mode to avoid PUT /logger with a host path firecracker can't open.
|
||||
mc.MetricsPath = ""
|
||||
mc.Jailer = &firecracker.JailerOpts{
|
||||
Binary: req.Jailer.Binary,
|
||||
ChrootBaseDir: req.Jailer.ChrootBaseDir,
|
||||
UID: req.Jailer.UID,
|
||||
GID: req.Jailer.GID,
|
||||
}
|
||||
return mc, nil
|
||||
}
|
||||
|
||||
func (s *Server) validateJailerOpts(opts JailerLaunchOpts, layout paths.Layout) error {
|
||||
if err := validateRootExecutable(opts.Binary); err != nil {
|
||||
return fmt.Errorf("jailer binary: %w", err)
|
||||
}
|
||||
// Chroot base must live under StateDir so hard-links into the chroot
|
||||
// share a filesystem with the image cache (RuntimeDir is tmpfs and
|
||||
// would EXDEV on os.Link). RuntimeDir is also accepted because the
|
||||
// jailer is happy on tmpfs when the kernel/drives happen to colocate
|
||||
// (e.g. tests).
|
||||
if err := s.validateManagedPath(opts.ChrootBaseDir, layout.StateDir, layout.RuntimeDir); err != nil {
|
||||
return fmt.Errorf("jailer chroot base: %w", err)
|
||||
}
|
||||
if opts.UID != s.meta.OwnerUID || opts.GID != s.meta.OwnerGID {
|
||||
return fmt.Errorf("jailer uid/gid (%d:%d) must match registered owner (%d:%d)", opts.UID, opts.GID, s.meta.OwnerUID, s.meta.OwnerGID)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// exposeJailerSockets makes the chroot-internal sockets reachable at the
|
||||
// host paths the daemon already references (sc.apiSock, vm.Runtime.VSockPath).
|
||||
// AF_UNIX connect(2) follows symlinks, so a symlink keeps the rest of the
|
||||
// daemon code unchanged. Computes both host targets from the chroot root and
|
||||
// the chroot-internal name, so the API socket and the vsock socket stay in
|
||||
// sync regardless of how the launch request laid them out.
|
||||
func (s *Server) exposeJailerSockets(req FirecrackerLaunchRequest) error {
|
||||
if req.Jailer == nil {
|
||||
return nil
|
||||
}
|
||||
chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID)
|
||||
hostAPI := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerSocketName, "/"))
|
||||
if err := atomicSymlink(hostAPI, req.SocketPath); err != nil {
|
||||
return fmt.Errorf("api socket symlink: %w", err)
|
||||
}
|
||||
if strings.TrimSpace(req.VSockPath) != "" {
|
||||
if err := manager.EnsureSocketAccessFor(ctx, req.VSockPath, "firecracker vsock socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil {
|
||||
return 0, err
|
||||
hostVSock := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerVSockName, "/"))
|
||||
if err := atomicSymlink(hostVSock, req.VSockPath); err != nil {
|
||||
return fmt.Errorf("vsock symlink: %w", err)
|
||||
}
|
||||
}
|
||||
pid := manager.ResolvePID(context.Background(), machine, req.SocketPath)
|
||||
if pid <= 0 {
|
||||
return 0, errors.New("firecracker started but pid could not be resolved")
|
||||
return nil
|
||||
}
|
||||
|
||||
// atomicSymlink points link at target, replacing whatever non-directory
// entry currently sits at link (a stale socket, an old symlink, or nothing).
//
// The new symlink is created under a temporary sibling name and then
// renamed over link, so observers see either the previous entry or the new
// symlink and never a missing path. The temporary name is removed again if
// the rename fails.
func atomicSymlink(target, link string) error {
	tmp := fmt.Sprintf("%s.tmp.%d", link, os.Getpid())

	// A crashed earlier attempt may have left the temporary name behind;
	// os.Symlink refuses to overwrite, so clear it first.
	if err := os.Remove(tmp); err != nil && !os.IsNotExist(err) {
		return err
	}
	if err := os.Symlink(target, tmp); err != nil {
		return err
	}
	if err := os.Rename(tmp, link); err != nil {
		// Best-effort cleanup; the rename error is the one worth reporting.
		_ = os.Remove(tmp)
		return err
	}
	return nil
}
|
||||
|
||||
// chrootDriveName returns the bare filename a drive should appear as inside
|
||||
// the chroot. We use the drive ID when present (rootfs, work, …) so the
|
||||
// chroot listing is self-explanatory; falling back to the source's basename
|
||||
// covers the unnamed case.
|
||||
func chrootDriveName(d firecracker.DriveConfig) string {
|
||||
if id := strings.TrimSpace(d.ID); id != "" {
|
||||
return id
|
||||
}
|
||||
return filepath.Base(d.Path)
|
||||
}
|
||||
|
||||
func (s *Server) validateLaunchDrivePath(drive firecracker.DriveConfig, stateDir string) error {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue