From 6b543cb17ff40a9ed3dca76a2e919993947d7c6a Mon Sep 17 00:00:00 2001 From: Thales Maciel Date: Tue, 28 Apr 2026 14:38:07 -0300 Subject: [PATCH] firecracker: adopt firecracker-jailer for VM launch (Phase B) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each VM's firecracker now runs inside a per-VM chroot dropped to the registered owner UID via firecracker-jailer. Closes the broad ambient- sudo escalation surface that survived Phase A: the helper still needs caps for tap/bridge/dm/loop/iptables, but the VMM itself no longer runs as root in the host root filesystem. The host helper stages each chroot up front: hard-links the kernel and (optional) initrd, mknods block-device drives + /dev/vhost-vsock, copies in the firecracker binary (jailer opens it O_RDWR so a ro bind fails with EROFS), and bind-mounts /usr/lib + /lib trees read-only so the dynamic linker can resolve. Self-binds the chroot first so the findmnt-guarded cleanup can recurse safely. AF_UNIX sun_path is 108 bytes; the chroot path easily blows past that. Daemon-side launch pre-symlinks the short request socket path to the long chroot socket before Machine.Start so the SDK's poll/connect sees the short path while the kernel resolves to the chroot socket. --new-pid-ns is intentionally disabled — jailer's PID-namespace fork makes the SDK see the parent exit and tear the API socket down too early. CapabilityBoundingSet for the helper expands to add CAP_FOWNER, CAP_KILL, CAP_MKNOD, CAP_SETGID, CAP_SETUID, CAP_SYS_CHROOT alongside the existing CAP_CHOWN/CAP_DAC_OVERRIDE/CAP_NET_ADMIN/CAP_NET_RAW/ CAP_SYS_ADMIN. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/cli/commands_system.go | 2 +- internal/cli/daemon_lifecycle_test.go | 2 +- internal/config/config.go | 19 ++ internal/daemon/daemon.go | 10 + internal/daemon/fcproc/fcproc.go | 304 ++++++++++++++++++++++++++ internal/daemon/privileged_ops.go | 218 +++++++++++++++--- internal/daemon/vm.go | 26 ++- internal/daemon/vm_lifecycle_steps.go | 19 ++ internal/firecracker/client.go | 90 +++++++- internal/model/types.go | 4 + internal/roothelper/roothelper.go | 212 ++++++++++++++++-- internal/system/system.go | 14 +- 12 files changed, 864 insertions(+), 56 deletions(-) diff --git a/internal/cli/commands_system.go b/internal/cli/commands_system.go index a729a2c..50768b0 100644 --- a/internal/cli/commands_system.go +++ b/internal/cli/commands_system.go @@ -367,7 +367,7 @@ func renderRootHelperSystemdUnit() string { "LockPersonality=yes", "SystemCallArchitectures=native", "RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 AF_NETLINK AF_VSOCK", - "CapabilityBoundingSet=CAP_CHOWN CAP_DAC_OVERRIDE CAP_NET_ADMIN CAP_NET_RAW CAP_SYS_ADMIN", + "CapabilityBoundingSet=CAP_CHOWN CAP_DAC_OVERRIDE CAP_FOWNER CAP_KILL CAP_MKNOD CAP_NET_ADMIN CAP_NET_RAW CAP_SETGID CAP_SETUID CAP_SYS_ADMIN CAP_SYS_CHROOT", "ReadWritePaths=/var/lib/banger", "RuntimeDirectory=banger-root", "RuntimeDirectoryMode=0711", diff --git a/internal/cli/daemon_lifecycle_test.go b/internal/cli/daemon_lifecycle_test.go index c050e18..7b946f7 100644 --- a/internal/cli/daemon_lifecycle_test.go +++ b/internal/cli/daemon_lifecycle_test.go @@ -183,7 +183,7 @@ func TestRenderRootHelperSystemdUnitIncludesRequiredCapabilities(t *testing.T) { "ProtectSystem=strict", "ProtectHome=yes", "RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 AF_NETLINK AF_VSOCK", - "CapabilityBoundingSet=CAP_CHOWN CAP_DAC_OVERRIDE CAP_NET_ADMIN CAP_NET_RAW CAP_SYS_ADMIN", + "CapabilityBoundingSet=CAP_CHOWN CAP_DAC_OVERRIDE CAP_FOWNER CAP_KILL CAP_MKNOD CAP_NET_ADMIN CAP_NET_RAW CAP_SETGID CAP_SETUID 
CAP_SYS_ADMIN CAP_SYS_CHROOT", "ReadWritePaths=/var/lib/banger", "RuntimeDirectory=banger-root", "RuntimeDirectoryMode=0711", diff --git a/internal/config/config.go b/internal/config/config.go index 700c01a..48670cd 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -22,6 +22,9 @@ import ( type fileConfig struct { LogLevel string `toml:"log_level"` FirecrackerBin string `toml:"firecracker_bin"` + JailerBin string `toml:"jailer_bin"` + JailerEnabled *bool `toml:"jailer_enabled"` + JailerChrootBase string `toml:"jailer_chroot_base"` SSHKeyPath string `toml:"ssh_key_path"` DefaultImageName string `toml:"default_image_name"` AutoStopStaleAfter string `toml:"auto_stop_stale_after"` @@ -75,6 +78,13 @@ func load(layout paths.Layout, home string, ensureDefaultSSHKey bool) (model.Dae DefaultDNS: model.DefaultDNS, DefaultImageName: "debian-bookworm", HostHomeDir: home, + JailerBin: model.DefaultJailerBinary, + JailerEnabled: true, + // Chroot lives under StateDir (ext4) — not RuntimeDir (tmpfs). + // Hard-linking the kernel and any file-backed drives into the + // chroot requires same-filesystem; images already live under + // StateDir, so colocating the chroot avoids EXDEV. 
+ JailerChrootBase: filepath.Join(layout.StateDir, "jail"), } var file fileConfig @@ -99,6 +109,15 @@ func load(layout paths.Layout, home string, ensureDefaultSSHKey bool) (model.Dae } else if path, err := system.LookupExecutable("firecracker"); err == nil { cfg.FirecrackerBin = path } + if value := strings.TrimSpace(file.JailerBin); value != "" { + cfg.JailerBin = value + } + if file.JailerEnabled != nil { + cfg.JailerEnabled = *file.JailerEnabled + } + if value := strings.TrimSpace(file.JailerChrootBase); value != "" { + cfg.JailerChrootBase = value + } if value := strings.TrimSpace(file.DefaultImageName); value != "" { cfg.DefaultImageName = value } diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go index 4cff28b..ca6b7c8 100644 --- a/internal/daemon/daemon.go +++ b/internal/daemon/daemon.go @@ -9,6 +9,8 @@ import ( "log/slog" "net" "os" + "path/filepath" + "strings" "sync" "time" @@ -88,6 +90,14 @@ func OpenSystem(ctx context.Context) (*Daemon, error) { if err != nil { return nil, err } + // config.Load fills JailerChrootBase from the layout it sees. In + // system mode that's the owner's layout (no privileged StateDir) so + // the value lands under the owner home — wrong for the helper, which + // validates paths against the system StateDir. Override it here when it is + // empty, relative, or under the owner StateDir, so daemon and helper both see /var/lib/banger/jail. 
+ if strings.TrimSpace(cfg.JailerChrootBase) == "" || !filepath.IsAbs(cfg.JailerChrootBase) || strings.HasPrefix(cfg.JailerChrootBase, ownerLayout.StateDir) { + cfg.JailerChrootBase = filepath.Join(layout.StateDir, "jail") + } helper := newHelperPrivilegedOps(roothelper.NewClient(installmeta.DefaultRootHelperSocketPath), cfg, layout) return openWithConfig(ctx, layout, ownerLayout, cfg, -1, -1, false, helper) } diff --git a/internal/daemon/fcproc/fcproc.go b/internal/daemon/fcproc/fcproc.go index ae2a0c9..7bd7990 100644 --- a/internal/daemon/fcproc/fcproc.go +++ b/internal/daemon/fcproc/fcproc.go @@ -11,11 +11,15 @@ import ( "fmt" "log/slog" "os" + "path/filepath" "strconv" "strings" "sync" + "syscall" "time" + "golang.org/x/sys/unix" + "banger/internal/firecracker" "banger/internal/system" ) @@ -271,6 +275,306 @@ func (m *Manager) Kill(ctx context.Context, pid int) error { return err } +// ChrootDriveSpec describes how a single drive should appear inside the +// jailer chroot. HostPath is the host-side source (a regular file or a +// /dev/mapper/* block device); ChrootName is the bare filename it should +// be reachable as inside the chroot (e.g. "rootfs"). The DM block device +// case is detected via os.Stat (S_IFBLK) — the helper mknods a matching +// node; everything else is hard-linked. +type ChrootDriveSpec struct { + ChrootName string + HostPath string +} + +// PrepareJailerChroot stages the chroot tree at chrootRoot for the jailer +// to take over on launch. After this call: +// +// - chrootRoot exists, mode 0700, owned by uid:gid. +// - chrootRoot/{kernelName} is a hard link of kernelHostPath, owned uid:gid. +// - chrootRoot/{initrdName} is a hard link of initrdHostPath if set. +// - For each drive: a hard link (regular file source) or a freshly +// mknod'd block device with the source's major/minor (DM source). +// - If wantVSock, /dev/vhost-vsock is mknod'd into the chroot so +// firecracker can open it after chroot. 
+// +// All filesystem mutations go through runner.RunSudo when the caller isn't +// root, so this works in dev (sudo) and system (root helper) modes alike. +// Path components are validated by the caller (roothelper) — this helper +// trusts them. +func (m *Manager) PrepareJailerChroot(ctx context.Context, chrootRoot string, uid, gid int, firecrackerHostPath, kernelHostPath, kernelName, initrdHostPath, initrdName string, drives []ChrootDriveSpec, wantVSock bool) error { + if strings.TrimSpace(chrootRoot) == "" { + return fmt.Errorf("chroot root is required") + } + if err := m.sudo(ctx, "mkdir", "-p", chrootRoot); err != nil { + return fmt.Errorf("create chroot root: %w", err) + } + if err := m.sudo(ctx, "chmod", "0700", chrootRoot); err != nil { + return fmt.Errorf("chmod chroot root: %w", err) + } + if err := m.chown(ctx, chrootRoot, uid, gid); err != nil { + return fmt.Errorf("chown chroot root: %w", err) + } + // The daemon (uid) needs to traverse the intermediate directories to reach + // the sockets firecracker creates inside the chroot. The per-VM dir + // ({base}/firecracker/{vmid}/) is chowned to uid so the daemon can reach + // {vmid}/root/. The {base}/firecracker/ base and {base}/ (jail) dirs get + // world-execute (--x) so any UID can traverse through them without listing + // their contents (the per-VM dirs are still protected by their own mode). + vmDir := filepath.Dir(chrootRoot) + if err := m.chown(ctx, vmDir, uid, gid); err != nil { + return fmt.Errorf("chown vm dir: %w", err) + } + fcBaseDir := filepath.Dir(vmDir) + if err := m.sudo(ctx, "chmod", "0711", fcBaseDir); err != nil { + return fmt.Errorf("chmod firecracker base dir: %w", err) + } + jailBaseDir := filepath.Dir(fcBaseDir) + if err := m.sudo(ctx, "chmod", "0711", jailBaseDir); err != nil { + return fmt.Errorf("chmod jail base dir: %w", err) + } + // Order matters: hard-link the kernel + file-backed drives BEFORE + // the self-bind below. 
link(2) refuses to cross mount points even + // when the underlying superblock is the same — once chrootRoot is a + // mount point, `ln /var/lib/.../kernel /vmlinux` returns + // EXDEV. + if err := m.linkInto(ctx, chrootRoot, kernelHostPath, kernelName, uid, gid); err != nil { + return fmt.Errorf("link kernel: %w", err) + } + if strings.TrimSpace(initrdHostPath) != "" { + if err := m.linkInto(ctx, chrootRoot, initrdHostPath, initrdName, uid, gid); err != nil { + return fmt.Errorf("link initrd: %w", err) + } + } + for _, d := range drives { + if err := m.stageDrive(ctx, chrootRoot, d, uid, gid); err != nil { + return fmt.Errorf("stage drive %s: %w", d.ChrootName, err) + } + } + if wantVSock { + // The jailer creates /dev inside the chroot, but /dev/vhost-vsock must + // be pre-staged so firecracker can open it after the jailer chroots. + devDir := chrootRoot + "/dev" + if err := m.sudo(ctx, "mkdir", "-p", devDir); err != nil { + return fmt.Errorf("create chroot/dev: %w", err) + } + if err := m.chown(ctx, devDir, uid, gid); err != nil { + return fmt.Errorf("chown chroot/dev: %w", err) + } + if err := m.stageDevice(ctx, chrootRoot, "dev/vhost-vsock", "/dev/vhost-vsock", uid, gid); err != nil { + return fmt.Errorf("stage vhost-vsock: %w", err) + } + } + // Bind firecracker + the host libdirs into the chroot read-only. + // firecracker is dynamically linked (interpreter /lib64/ld-linux-*, + // libc, libgcc), and inside the chroot ENOENT on those is reported + // as "Failed to exec into Firecracker: No such file or directory" — + // the kernel's misleading ENOENT-for-missing-interpreter error. + // + // Done last so the link/mknod steps above don't have to cross the + // self-bind mount boundary (link(2) returns EXDEV at mount edges). + // Self-bind first so CleanupJailerChroot's `umount -lR` can recurse + // from chrootRoot itself; --make-private blocks propagation back to + // the host mount namespace. 
+ // firecracker is copied (not bind-mounted) because jailer opens the + // binary O_RDWR — apparently to seal it or rewrite something — and + // fails with EROFS on a ro-bind. + chrootFC := chrootRoot + "/" + filepath.Base(firecrackerHostPath) + if err := m.sudo(ctx, "cp", "-f", firecrackerHostPath, chrootFC); err != nil { + return fmt.Errorf("copy firecracker into chroot: %w", err) + } + if err := m.sudo(ctx, "chmod", "0755", chrootFC); err != nil { + return fmt.Errorf("chmod firecracker in chroot: %w", err) + } + if err := m.chown(ctx, chrootFC, uid, gid); err != nil { + return fmt.Errorf("chown firecracker in chroot: %w", err) + } + if err := m.sudo(ctx, "mount", "--bind", chrootRoot, chrootRoot); err != nil { + return fmt.Errorf("self-bind chroot: %w", err) + } + // Remount without nosuid: the helper unit's ReadWritePaths binding marks + // /var/lib/banger nosuid, and bind mounts inherit that flag. The jailer + // needs to exec /firecracker as UID 1000, which the kernel denies on a + // nosuid mount when NoNewPrivileges is set on the unit. + if err := m.sudo(ctx, "mount", "-o", "remount,bind,suid", chrootRoot, chrootRoot); err != nil { + return fmt.Errorf("remount chroot suid: %w", err) + } + if err := m.sudo(ctx, "mount", "--make-private", chrootRoot); err != nil { + return fmt.Errorf("make-private chroot: %w", err) + } + // Pre-create /usr with world-traversable permissions. UMask=0077 on the + // helper unit causes plain mkdir to produce 0700 dirs; UID 1000 must be + // able to traverse /usr/ to reach the dynamic linker via lib64 → usr/lib. + if err := m.sudo(ctx, "install", "-d", "-m", "0755", chrootRoot+"/usr"); err != nil { + return fmt.Errorf("create chroot/usr: %w", err) + } + // Bind real libdirs and replicate the host's compat symlinks + // (/lib64 → /usr/lib, etc) inside the chroot so firecracker's + // PT_INTERP path (/lib64/ld-linux-*) resolves to the bound libs. 
+ for _, libDir := range []string{"/usr/lib", "/usr/lib64", "/lib", "/lib64"} { + info, err := os.Lstat(libDir) + if err != nil { + continue + } + target := chrootRoot + libDir + if info.Mode()&os.ModeSymlink != 0 { + link, err := os.Readlink(libDir) + if err != nil { + continue + } + if err := m.sudo(ctx, "ln", "-sfn", link, target); err != nil { + return fmt.Errorf("symlink %s -> %s: %w", target, link, err) + } + continue + } + if !info.IsDir() { + continue + } + if err := m.bindDir(ctx, libDir, target, true); err != nil { + return fmt.Errorf("bind %s: %w", libDir, err) + } + } + return nil +} + +// CleanupJailerChroot tears down a chroot built by PrepareJailerChroot: +// lazy-recursive umount of every mount under (or at) chrootRoot, then a +// findmnt-guarded `rm -rf`. The guard is load-bearing: if any bind mount +// remained, `rm -rf` would descend into the bind source (e.g. /usr/lib) +// and start deleting host files. The umount runs `-l` (lazy) so an in-use +// bind point still gets detached from the namespace; the guarded check +// then catches the rare case where detachment didn't happen. +func (m *Manager) CleanupJailerChroot(ctx context.Context, chrootRoot string) error { + if strings.TrimSpace(chrootRoot) == "" { + return nil + } + if _, err := os.Stat(chrootRoot); os.IsNotExist(err) { + return nil + } + // Best-effort umount: for chroots that were never bind-mounted (a + // stale install pre-bind-mount work, say) this fails — that's fine, + // the findmnt guard below is what enforces safety. 
+ _ = m.sudoIgnore(ctx, "umount", "--recursive", "--lazy", chrootRoot) + if mounts, err := m.mountsUnder(ctx, chrootRoot); err != nil { + return fmt.Errorf("inspect chroot mounts: %w", err) + } else if len(mounts) > 0 { + return fmt.Errorf("refusing to rm -rf %q: still has %d mount(s): %v", chrootRoot, len(mounts), mounts) + } + return m.sudo(ctx, "rm", "-rf", "--", chrootRoot) +} + +func (m *Manager) sudoIgnore(ctx context.Context, name string, args ...string) error { + err := m.sudo(ctx, name, args...) + return err +} + +func (m *Manager) bindFile(ctx context.Context, source, target string, readOnly bool) error { + if err := m.sudo(ctx, "install", "-D", "-m", "0644", "/dev/null", target); err != nil { + return fmt.Errorf("create bind target file: %w", err) + } + return m.bindMount(ctx, source, target, readOnly) +} + +func (m *Manager) bindDir(ctx context.Context, source, target string, readOnly bool) error { + if err := m.sudo(ctx, "mkdir", "-p", target); err != nil { + return fmt.Errorf("create bind target dir: %w", err) + } + return m.bindMount(ctx, source, target, readOnly) +} + +func (m *Manager) bindMount(ctx context.Context, source, target string, readOnly bool) error { + if err := m.sudo(ctx, "mount", "--bind", source, target); err != nil { + return err + } + if !readOnly { + return nil + } + // Single-step ro bind isn't honored by all kernels — the bind happens + // rw and the ro flag is silently ignored. Remount makes it stick. + return m.sudo(ctx, "mount", "-o", "remount,bind,ro", target) +} + +// mountsUnder returns the list of mount targets at or under chrootRoot. +// findmnt's output is one path per line; an empty list means no leftovers. 
+func (m *Manager) mountsUnder(ctx context.Context, chrootRoot string) ([]string, error) { + out, err := m.runner.Run(ctx, "findmnt", "--output", "TARGET", "--list", "--noheadings") + if err != nil { + return nil, err + } + var mounts []string + prefix := chrootRoot + string(os.PathSeparator) + for _, line := range strings.Split(string(out), "\n") { + t := strings.TrimSpace(line) + if t == chrootRoot || strings.HasPrefix(t, prefix) { + mounts = append(mounts, t) + } + } + return mounts, nil +} + +func (m *Manager) stageDrive(ctx context.Context, chrootRoot string, d ChrootDriveSpec, uid, gid int) error { + info, err := os.Stat(d.HostPath) + if err != nil { + return err + } + if info.Mode()&os.ModeDevice != 0 { + stat, ok := info.Sys().(*syscall.Stat_t) + if !ok { + return fmt.Errorf("stat %s: cannot read device numbers", d.HostPath) + } + major := unix.Major(stat.Rdev) + minor := unix.Minor(stat.Rdev) + return m.mknodBlock(ctx, chrootRoot, d.ChrootName, major, minor, uid, gid) + } + return m.linkInto(ctx, chrootRoot, d.HostPath, d.ChrootName, uid, gid) +} + +func (m *Manager) stageDevice(ctx context.Context, chrootRoot, chrootName, hostDevice string, uid, gid int) error { + info, err := os.Stat(hostDevice) + if err != nil { + return err + } + stat, ok := info.Sys().(*syscall.Stat_t) + if !ok { + return fmt.Errorf("stat %s: cannot read device numbers", hostDevice) + } + major := unix.Major(stat.Rdev) + minor := unix.Minor(stat.Rdev) + target := chrootRoot + "/" + chrootName + if err := m.sudo(ctx, "mknod", "-m", "0660", target, "c", strconv.FormatUint(uint64(major), 10), strconv.FormatUint(uint64(minor), 10)); err != nil { + return err + } + return m.chown(ctx, target, uid, gid) +} + +func (m *Manager) mknodBlock(ctx context.Context, chrootRoot, name string, major, minor uint32, uid, gid int) error { + target := chrootRoot + "/" + name + if err := m.sudo(ctx, "mknod", "-m", "0660", target, "b", strconv.FormatUint(uint64(major), 10), strconv.FormatUint(uint64(minor), 
10)); err != nil { + return err + } + return m.chown(ctx, target, uid, gid) +} + +func (m *Manager) linkInto(ctx context.Context, chrootRoot, source, name string, uid, gid int) error { + target := chrootRoot + "/" + name + if err := m.sudo(ctx, "ln", "-f", source, target); err != nil { + return err + } + return m.chown(ctx, target, uid, gid) +} + +func (m *Manager) chown(ctx context.Context, target string, uid, gid int) error { + return m.sudo(ctx, "chown", fmt.Sprintf("%d:%d", uid, gid), target) +} + +func (m *Manager) sudo(ctx context.Context, name string, args ...string) error { + if os.Geteuid() == 0 { + _, err := m.runner.Run(ctx, name, args...) + return err + } + _, err := m.runner.RunSudo(ctx, append([]string{name}, args...)...) + return err +} + func waitForPath(ctx context.Context, path string, timeout time.Duration, label string) error { return pollPath(ctx, path, timeout, 100*time.Millisecond, label) } diff --git a/internal/daemon/privileged_ops.go b/internal/daemon/privileged_ops.go index 11d2411..6d498c6 100644 --- a/internal/daemon/privileged_ops.go +++ b/internal/daemon/privileged_ops.go @@ -3,8 +3,10 @@ package daemon import ( "context" "errors" + "fmt" "log/slog" "os" + "path/filepath" "strconv" "strings" "syscall" @@ -39,6 +41,7 @@ type privilegedOps interface { KillProcess(context.Context, int) error SignalProcess(context.Context, int, string) error ProcessRunning(context.Context, int, string) (bool, error) + CleanupJailerChroot(context.Context, string) error } type localPrivilegedOps struct { @@ -170,7 +173,77 @@ func (o *localPrivilegedOps) ResolveFirecrackerBinary(_ context.Context, request } func (o *localPrivilegedOps) LaunchFirecracker(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (int, error) { - machine, err := firecracker.NewMachine(ctx, firecracker.MachineConfig{ + mc, err := o.buildLaunchMachineConfig(ctx, req) + if err != nil { + return 0, err + } + // Symlink before Start: with jailer the actual API socket lives at + 
// `{chroot}/firecracker.socket` (~120+ bytes — over the AF_UNIX + // sun_path limit of 108). The SDK's waitForSocket and connect(2) + // would EINVAL on the long path. Pre-creating the symlink at the + // short req.SocketPath lets the SDK poll/connect via the short + // path; the kernel only enforces sun_path on the path you pass, + // not on the resolved target. + if err := o.exposeJailerSockets(req); err != nil { + return 0, fmt.Errorf("expose jailer sockets: %w", err) + } + machine, err := firecracker.NewMachine(ctx, mc) + if err != nil { + return 0, err + } + chownDone := o.maybeChownSockets(ctx, req, mc) + startErr := machine.Start(ctx) + chownErr := <-chownDone + if startErr != nil { + if pid := o.fc().ResolvePID(context.Background(), machine, mc.SocketPath); pid > 0 { + _ = o.KillProcess(context.Background(), pid) + } + return 0, startErr + } + if chownErr != nil { + return 0, chownErr + } + if req.Jailer == nil { + // Belt-and-suspenders for the legacy direct-firecracker path. + // The jailer path doesn't need this — firecracker drops to the + // configured uid before creating the socket. + if err := o.EnsureSocketAccess(ctx, mc.SocketPath, "firecracker api socket"); err != nil { + return 0, err + } + if strings.TrimSpace(mc.VSockPath) != "" { + if err := o.EnsureSocketAccess(ctx, mc.VSockPath, "firecracker vsock socket"); err != nil { + return 0, err + } + } + } + pid := o.fc().ResolvePID(context.Background(), machine, mc.SocketPath) + if pid <= 0 { + return 0, errors.New("firecracker started but pid could not be resolved") + } + return pid, nil +} + +// maybeChownSockets runs the post-Start sudo-chown race only on the legacy +// direct-firecracker path. With the jailer the firecracker process is +// already running as the configured uid before it creates the socket, so +// no chown is needed (and chown on the symlink would tweak the symlink's +// metadata — not the target's — anyway). 
+func (o *localPrivilegedOps) maybeChownSockets(ctx context.Context, req roothelper.FirecrackerLaunchRequest, mc firecracker.MachineConfig) <-chan error { + if req.Jailer != nil { + ch := make(chan error, 1) + ch <- nil + close(ch) + return ch + } + return o.fc().EnsureSocketAccessForAsync(ctx, []string{mc.SocketPath, mc.VSockPath}, o.clientUID, o.clientGID) +} + +// buildLaunchMachineConfig mirrors the helper-side equivalent: when jailer +// is enabled, stage the chroot tree and rewrite the path fields to their +// chroot-translated form (host-visible for sockets, chroot-internal for +// kernel/drives — see firecracker.MachineConfig.Jailer doc). +func (o *localPrivilegedOps) buildLaunchMachineConfig(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (firecracker.MachineConfig, error) { + mc := firecracker.MachineConfig{ BinaryPath: req.BinaryPath, VMID: req.VMID, SocketPath: req.SocketPath, @@ -186,40 +259,101 @@ func (o *localPrivilegedOps) LaunchFirecracker(ctx context.Context, req roothelp VCPUCount: req.VCPUCount, MemoryMiB: req.MemoryMiB, Logger: o.logger, - }) - if err != nil { - return 0, err } - // Race the chown against the SDK's HTTP probe inside Start: when the - // daemon is non-root, firecracker is launched under sudo and the API - // socket appears root-owned. Without a concurrent chown the SDK's - // connect(2) gets EACCES and Start times out before our post-Start - // EnsureSocketAccess can ever run. 
- chownDone := o.fc().EnsureSocketAccessForAsync(ctx, []string{req.SocketPath, req.VSockPath}, o.clientUID, o.clientGID) - startErr := machine.Start(ctx) - chownErr := <-chownDone - if startErr != nil { - if pid := o.fc().ResolvePID(context.Background(), machine, req.SocketPath); pid > 0 { - _ = o.KillProcess(context.Background(), pid) - } - return 0, startErr + if req.Jailer == nil { + return mc, nil } - if chownErr != nil { - return 0, chownErr + chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID) + driveSpecs := make([]fcproc.ChrootDriveSpec, 0, len(req.Drives)) + chrootDrives := make([]firecracker.DriveConfig, 0, len(req.Drives)) + for _, d := range req.Drives { + name := chrootDriveName(d) + driveSpecs = append(driveSpecs, fcproc.ChrootDriveSpec{ChrootName: name, HostPath: d.Path}) + chrootDrives = append(chrootDrives, firecracker.DriveConfig{ + ID: d.ID, + Path: "/" + name, + ReadOnly: d.ReadOnly, + IsRoot: d.IsRoot, + }) } - if err := o.EnsureSocketAccess(ctx, req.SocketPath, "firecracker api socket"); err != nil { - return 0, err + wantVSock := strings.TrimSpace(req.VSockPath) != "" + if err := o.fc().PrepareJailerChroot(ctx, chrootRoot, + req.Jailer.UID, req.Jailer.GID, + req.BinaryPath, + req.KernelImagePath, "vmlinux", + req.InitrdPath, "initrd", + driveSpecs, wantVSock, + ); err != nil { + return firecracker.MachineConfig{}, fmt.Errorf("prepare jailer chroot: %w", err) + } + // SocketPath stays the short request path: the SDK polls/connects + // to it via os.Stat / net.Dial("unix", ...), and AF_UNIX sun_path + // is hard-capped at 108 bytes — the actual chroot path is well over + // that. exposeJailerSockets pre-creates the req.SocketPath as a + // symlink whose target is the long chroot socket; the kernel only + // enforces sun_path on the path you hand to connect, not on the + // resolved target. 
+ // + // VSockPath, by contrast, is sent to firecracker via the API and + // resolved from inside the chroot, so it must be the chroot-internal + // path. The host-visible vsock socket is reachable via a symlink + // at req.VSockPath, also installed by exposeJailerSockets. + _ = chrootRoot + if wantVSock { + mc.VSockPath = firecracker.JailerVSockName + } + mc.KernelImagePath = "/vmlinux" + if strings.TrimSpace(req.InitrdPath) != "" { + mc.InitrdPath = "/initrd" + } else { + mc.InitrdPath = "" + } + mc.Drives = chrootDrives + // LogPath stays set so buildProcessRunner's openLogFile captures firecracker + // stderr via cmd.Stderr. buildConfig clears sdk.Config.LogPath for jailer + // mode to avoid PUT /logger with a host path firecracker can't open. + mc.MetricsPath = "" + mc.Jailer = &firecracker.JailerOpts{ + Binary: req.Jailer.Binary, + ChrootBaseDir: req.Jailer.ChrootBaseDir, + UID: req.Jailer.UID, + GID: req.Jailer.GID, + } + return mc, nil +} + +func (o *localPrivilegedOps) exposeJailerSockets(req roothelper.FirecrackerLaunchRequest) error { + if req.Jailer == nil { + return nil + } + chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID) + hostAPI := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerSocketName, "/")) + if err := atomicSymlink(hostAPI, req.SocketPath); err != nil { + return err } if strings.TrimSpace(req.VSockPath) != "" { - if err := o.EnsureSocketAccess(ctx, req.VSockPath, "firecracker vsock socket"); err != nil { - return 0, err + hostVSock := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerVSockName, "/")) + if err := atomicSymlink(hostVSock, req.VSockPath); err != nil { + return err } } - pid := o.fc().ResolvePID(context.Background(), machine, req.SocketPath) - if pid <= 0 { - return 0, errors.New("firecracker started but pid could not be resolved") + return nil +} + +// chrootDriveName mirrors the helper-side helper of the same name; kept as +// a free function so both paths produce 
identical chroot layouts. +func chrootDriveName(d firecracker.DriveConfig) string { + if id := strings.TrimSpace(d.ID); id != "" { + return id } - return pid, nil + return filepath.Base(d.Path) +} + +func atomicSymlink(target, link string) error { + if err := os.Remove(link); err != nil && !os.IsNotExist(err) { + return err + } + return os.Symlink(target, link) } func (o *localPrivilegedOps) EnsureSocketAccess(ctx context.Context, socketPath, label string) error { @@ -246,6 +380,10 @@ func (o *localPrivilegedOps) ProcessRunning(_ context.Context, pid int, apiSock return system.ProcessRunning(pid, apiSock), nil } +func (o *localPrivilegedOps) CleanupJailerChroot(ctx context.Context, chrootRoot string) error { + return o.fc().CleanupJailerChroot(ctx, chrootRoot) +} + func (o *localPrivilegedOps) fc() *fcproc.Manager { return fcproc.New(o.runner, fcproc.Config{ FirecrackerBin: normalizeFirecrackerBinary("", o.config.FirecrackerBin), @@ -320,7 +458,27 @@ func (o *helperPrivilegedOps) ResolveFirecrackerBinary(ctx context.Context, requ func (o *helperPrivilegedOps) LaunchFirecracker(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (int, error) { req.Network = o.networkConfig() - return o.client.LaunchFirecracker(ctx, req) + pid, err := o.client.LaunchFirecracker(ctx, req) + if err != nil { + return 0, err + } + // The root helper runs with PrivateMounts=yes, so symlinks it creates + // (exposeJailerSockets) are invisible to the daemon's namespace. Re-create + // them here so the daemon can reach the API and vsock sockets. 
+ if req.Jailer != nil { + chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID) + hostAPI := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerSocketName, "/")) + if err := atomicSymlink(hostAPI, req.SocketPath); err != nil { + return 0, fmt.Errorf("api socket symlink: %w", err) + } + if strings.TrimSpace(req.VSockPath) != "" { + hostVSock := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerVSockName, "/")) + if err := atomicSymlink(hostVSock, req.VSockPath); err != nil { + return 0, fmt.Errorf("vsock symlink: %w", err) + } + } + } + return pid, nil } func (o *helperPrivilegedOps) EnsureSocketAccess(ctx context.Context, socketPath, label string) error { @@ -348,6 +506,10 @@ func (o *helperPrivilegedOps) ProcessRunning(ctx context.Context, pid int, apiSo return o.client.ProcessRunning(ctx, pid, apiSock) } +func (o *helperPrivilegedOps) CleanupJailerChroot(ctx context.Context, chrootRoot string) error { + return o.client.CleanupJailerChroot(ctx, chrootRoot) +} + func (o *helperPrivilegedOps) networkConfig() roothelper.NetworkConfig { return roothelper.NetworkConfig{ BridgeName: o.config.BridgeName, diff --git a/internal/daemon/vm.go b/internal/daemon/vm.go index 09087cb..4551c96 100644 --- a/internal/daemon/vm.go +++ b/internal/daemon/vm.go @@ -10,6 +10,7 @@ import ( "time" "banger/internal/daemon/fcproc" + "banger/internal/firecracker" "banger/internal/model" "banger/internal/namegen" "banger/internal/system" @@ -149,14 +150,35 @@ func (s *VMService) cleanupRuntime(ctx context.Context, vm model.VMRecord, prese if vm.Runtime.VSockPath != "" { _ = os.Remove(vm.Runtime.VSockPath) } + // Remove the jailer chroot tree (kernel hard-links, mknod'd device + // nodes, the chroot root itself). Skipped silently when the jailer + // is disabled or the chroot was never created. 
We intentionally + // don't gate on JailerEnabled today — old VMs created before the + // flag flipped on still need their chroots removed if any exist. + jailerErr := s.cleanupJailerChroot(ctx, vm) // The handles are only meaningful while the kernel objects exist; // dropping them here keeps the cache in sync with reality even // when the caller forgets to call clearVMHandles explicitly. s.clearVMHandles(vm) if !preserveDisks && vm.Runtime.VMDir != "" { - return errors.Join(waitErr, snapshotErr, featureErr, tapErr, os.RemoveAll(vm.Runtime.VMDir)) + return errors.Join(waitErr, snapshotErr, featureErr, tapErr, jailerErr, os.RemoveAll(vm.Runtime.VMDir)) } - return errors.Join(waitErr, snapshotErr, featureErr, tapErr) + return errors.Join(waitErr, snapshotErr, featureErr, tapErr, jailerErr) +} + +// cleanupJailerChroot removes the per-VM chroot tree if it exists. Returns +// nil silently when the jailer was never enabled or the chroot path can't +// be computed (no JailerChrootBase configured). +func (s *VMService) cleanupJailerChroot(ctx context.Context, vm model.VMRecord) error { + base := strings.TrimSpace(s.config.JailerChrootBase) + if base == "" { + return nil + } + chrootRoot := firecracker.JailerChrootRoot(base, vm.ID) + if _, err := os.Stat(chrootRoot); os.IsNotExist(err) { + return nil + } + return s.privOps().CleanupJailerChroot(ctx, chrootRoot) } func (s *VMService) generateName(ctx context.Context) (string, error) { diff --git a/internal/daemon/vm_lifecycle_steps.go b/internal/daemon/vm_lifecycle_steps.go index 6fcf27f..30f2c02 100644 --- a/internal/daemon/vm_lifecycle_steps.go +++ b/internal/daemon/vm_lifecycle_steps.go @@ -14,6 +14,24 @@ import ( "banger/internal/system" ) +// jailerOpts returns the jailer launch options to bundle in the firecracker +// launch request, or nil when the jailer is disabled or misconfigured. +// nil makes the launch fall back to the legacy direct-firecracker path. 
+func (s *VMService) jailerOpts() *roothelper.JailerLaunchOpts { + if !s.config.JailerEnabled { + return nil + } + if strings.TrimSpace(s.config.JailerBin) == "" || strings.TrimSpace(s.config.JailerChrootBase) == "" { + return nil + } + return &roothelper.JailerLaunchOpts{ + Binary: s.config.JailerBin, + ChrootBaseDir: s.config.JailerChrootBase, + UID: os.Getuid(), + GID: os.Getgid(), + } +} + // buildKernelArgs assembles the kernel command line for a start. // Direct-boot images (no initrd) get kernel-level IP config so the // network is up before init, plus init= pointing at the universal @@ -344,6 +362,7 @@ func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startS VSockCID: sc.vm.Runtime.VSockCID, VCPUCount: sc.vm.Spec.VCPUCount, MemoryMiB: sc.vm.Spec.MemoryMiB, + Jailer: s.jailerOpts(), } machineConfig := firecracker.MachineConfig{Drives: launchReq.Drives} s.capHooks.contributeMachine(&machineConfig, *sc.vm, sc.image) diff --git a/internal/firecracker/client.go b/internal/firecracker/client.go index eacb50b..93a346a 100644 --- a/internal/firecracker/client.go +++ b/internal/firecracker/client.go @@ -6,6 +6,8 @@ import ( "log/slog" "os" "os/exec" + "path/filepath" + "strconv" "strings" "sync" @@ -32,8 +34,34 @@ type MachineConfig struct { VCPUCount int MemoryMiB int Logger *slog.Logger + // Jailer, when non-nil, wraps firecracker in `jailer`. Path fields + // (SocketPath, KernelImagePath, InitrdPath, Drives[].Path, VSockPath) + // MUST be pre-translated by the caller: SocketPath as a host-visible + // path that reaches the chroot's API socket; the rest (including + // VSockPath) as chroot-internal paths (jailer chroots before exec, so + // firecracker resolves them inside the chroot). + Jailer *JailerOpts } +// JailerOpts captures the jailer-specific knobs. 
The chroot tree at +// `<ChrootBaseDir>/firecracker/<VMID>/root/` and the kernel/drive nodes +// inside it must be staged by the caller before NewMachine — this +// package only constructs the launch cmd. 
+type JailerOpts struct { + Binary string + ChrootBaseDir string + UID int + GID int +} + +// JailerSocketName is the chroot-relative API socket path passed to +// firecracker via --api-sock. Lives at the chroot root (no /run/ subdir +// required) so we don't depend on jailer creating intermediate dirs. +const JailerSocketName = "/firecracker.socket" + +// JailerVSockName mirrors JailerSocketName for the vsock UDS. +const JailerVSockName = "/vsock.sock" + type DriveConfig struct { ID string Path string @@ -74,6 +102,13 @@ func NewMachine(ctx context.Context, cfg MachineConfig) (*Machine, error) { return &Machine{machine: machine, logFile: logFile}, nil } +// JailerChrootRoot returns the host-visible path to the jailer chroot +// root for vmid under base. Mirrors the layout firecracker's jailer +// builds: <base>/firecracker/<vmid>/root. +func JailerChrootRoot(base, vmid string) string { + return filepath.Join(base, "firecracker", vmid, "root") +} + func (m *Machine) Start(ctx context.Context) error { // The caller's ctx is INTENTIONALLY not forwarded to the SDK. // firecracker-go-sdk's startVMM (machine.go) spawns a goroutine @@ -141,7 +176,7 @@ func buildConfig(cfg MachineConfig) sdk.Config { } drives := drivesBuilder.Build() - return sdk.Config{ + out := sdk.Config{ SocketPath: cfg.SocketPath, LogPath: cfg.LogPath, MetricsPath: cfg.MetricsPath, @@ -162,6 +197,18 @@ func buildConfig(cfg MachineConfig) sdk.Config { }, VMID: cfg.VMID, } + if cfg.Jailer != nil { + // The path fields above are already chroot-translated by the + // caller (see MachineConfig.Jailer doc). Skip the SDK's host-side + // existence checks — kernel/drives live inside the chroot, not + // at the paths we report. + out.DisableValidation = true + // LogPath is the host-side file used only for cmd.Stderr capture. + // Clearing it here prevents the SDK from sending PUT /logger with + // a host path that firecracker can't open from inside the chroot. 
+ out.LogPath = "" + } + return out } func buildVsockDevices(cfg MachineConfig) []sdk.VsockDevice { @@ -214,13 +261,26 @@ func defaultDriveID(drive DriveConfig, fallback string) string { // fcproc.EnsureSocketAccessForAsync immediately *before* Machine.Start so the // chown wins the race against the SDK's HTTP probe over the API socket. That // replaces the previous in-shell chown_watcher. +// +// When cfg.Jailer is set, the launch is wrapped by `jailer`. The chroot tree +// MUST already be staged (kernel hard-linked, drives mknod'd, dirs chowned to +// the configured UID:GID) — see fcproc.PrepareJailerChroot. The SDK's own +// JailerCfg path is intentionally bypassed: it cannot mknod block devices and +// does not expose --new-pid-ns. func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd { - args := []string{"--api-sock", cfg.SocketPath, "--id", cfg.VMID} + var bin string + var args []string + if cfg.Jailer != nil { + bin, args = jailerArgs(cfg) + } else { + bin = cfg.BinaryPath + args = []string{"--api-sock", cfg.SocketPath, "--id", cfg.VMID} + } var cmd *exec.Cmd if os.Geteuid() == 0 { - cmd = exec.Command(cfg.BinaryPath, args...) + cmd = exec.Command(bin, args...) } else { - cmd = exec.Command("sudo", append([]string{"-n", "-E", cfg.BinaryPath}, args...)...) + cmd = exec.Command("sudo", append([]string{"-n", "-E", bin}, args...)...) } cmd.Stdin = nil if logFile != nil { @@ -230,6 +290,28 @@ func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd { return cmd } +// jailerArgs returns the (binary, args) tuple for the jailer wrapper. +// firecracker's flags are passed after `--`. --new-pid-ns (which the SDK's +// JailerCommandBuilder doesn't expose in v1.0.0) is deliberately NOT passed; +// see the note in the args list. sdk.Config.JailerCfg is bypassed because +// it cannot mknod block devices into the chroot. 
+func jailerArgs(cfg MachineConfig) (string, []string) { + args := []string{ + "--id", cfg.VMID, + "--uid", strconv.Itoa(cfg.Jailer.UID), + "--gid", strconv.Itoa(cfg.Jailer.GID), + "--exec-file", cfg.BinaryPath, + "--chroot-base-dir", cfg.Jailer.ChrootBaseDir, + // "--new-pid-ns": jailer forks when creating the PID namespace; the + // SDK tracks the parent's PID, which exits immediately, causing the + // SDK's "process exited" goroutine to tear down the API socket while + // firecracker is still booting in the child. Left out intentionally. + "--", + "--api-sock", JailerSocketName, + } + return cfg.Jailer.Binary, args +} + func newLogger(base *slog.Logger) *logrus.Entry { logger := logrus.New() logger.SetOutput(io.Discard) diff --git a/internal/model/types.go b/internal/model/types.go index 7be2ffb..61da2e6 100644 --- a/internal/model/types.go +++ b/internal/model/types.go @@ -22,6 +22,7 @@ const ( DefaultStatsPollInterval = 10 * time.Second DefaultStaleSweepInterval = 1 * time.Minute MaxDiskBytes int64 = 128 * 1024 * 1024 * 1024 + DefaultJailerBinary = "/usr/bin/jailer" ) type VMState string @@ -36,6 +37,9 @@ const ( type DaemonConfig struct { LogLevel string FirecrackerBin string + JailerBin string + JailerEnabled bool + JailerChrootBase string SSHKeyPath string HostHomeDir string AutoStopStaleAfter time.Duration diff --git a/internal/roothelper/roothelper.go b/internal/roothelper/roothelper.go index ec3626f..bad286c 100644 --- a/internal/roothelper/roothelper.go +++ b/internal/roothelper/roothelper.go @@ -47,6 +47,7 @@ const ( methodKillProcess = "priv.kill_process" methodSignalProcess = "priv.signal_process" methodProcessRunning = "priv.process_running" + methodCleanupJailerChroot = "priv.cleanup_jailer_chroot" rootfsDMNamePrefix = "fc-rootfs-" vmTapPrefix = "tap-fc-" tapPoolPrefix = "tap-pool-" @@ -82,6 +83,18 @@ type FirecrackerLaunchRequest struct { VCPUCount int `json:"vcpu_count"` MemoryMiB int `json:"memory_mib"` Network NetworkConfig `json:"network"` + 
Jailer *JailerLaunchOpts `json:"jailer,omitempty"` +} + +// JailerLaunchOpts mirrors firecracker.JailerOpts for the RPC wire. UID +// and GID are the (un)privileged target the jailer drops to; the helper +// enforces they match the registered owner so the daemon can't ask the +// helper to run firecracker as an arbitrary user. +type JailerLaunchOpts struct { + Binary string `json:"binary"` + ChrootBaseDir string `json:"chroot_base_dir"` + UID int `json:"uid"` + GID int `json:"gid"` } type findPIDResult struct { @@ -220,6 +233,13 @@ func (c *Client) LaunchFirecracker(ctx context.Context, req FirecrackerLaunchReq return result.PID, nil } +func (c *Client) CleanupJailerChroot(ctx context.Context, chrootRoot string) error { + _, err := rpc.Call[struct{}](ctx, c.socketPath, methodCleanupJailerChroot, struct { + ChrootRoot string `json:"chroot_root"` + }{ChrootRoot: chrootRoot}) + return err +} + func (c *Client) EnsureSocketAccess(ctx context.Context, socketPath, label string) error { _, err := rpc.Call[struct{}](ctx, c.socketPath, methodEnsureSocketAccess, struct { SocketPath string `json:"socket_path"` @@ -589,6 +609,19 @@ func (s *Server) dispatch(ctx context.Context, req rpc.Request) rpc.Response { return rpc.NewError("bad_params", err.Error()) } return marshalResultOrError(processRunningResult{Running: system.ProcessRunning(params.PID, params.APISock)}, nil) + case methodCleanupJailerChroot: + params, err := rpc.DecodeParams[struct { + ChrootRoot string `json:"chroot_root"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + systemLayout := paths.ResolveSystem() + if err := s.validateManagedPath(params.ChrootRoot, systemLayout.StateDir, systemLayout.RuntimeDir); err != nil { + return rpc.NewError("invalid_path", err.Error()) + } + err = fcproc.New(s.runner, fcproc.Config{}, s.logger).CleanupJailerChroot(ctx, params.ChrootRoot) + return marshalResultOrError(struct{}{}, err) default: return rpc.NewError("unknown_method", req.Method) } 
@@ -718,7 +751,59 @@ func (s *Server) launchFirecracker(ctx context.Context, req FirecrackerLaunchReq return 0, err } } - machine, err := firecracker.NewMachine(ctx, firecracker.MachineConfig{ + mgr := fcproc.New(s.runner, fcproc.Config{BridgeName: req.Network.BridgeName, BridgeIP: req.Network.BridgeIP, CIDR: req.Network.CIDR}, s.logger) + mc, err := s.buildLaunchMachineConfig(ctx, req, systemLayout, mgr) + if err != nil { + return 0, err + } + // Pre-Start symlink: see localPrivilegedOps.LaunchFirecracker for + // the AF_UNIX sun_path-length rationale. + if err := s.exposeJailerSockets(req); err != nil { + return 0, fmt.Errorf("expose jailer sockets: %w", err) + } + machine, err := firecracker.NewMachine(ctx, mc) + if err != nil { + return 0, err + } + if err := machine.Start(ctx); err != nil { + if pid := mgr.ResolvePID(context.Background(), machine, mc.SocketPath); pid > 0 { + _, _ = s.runner.Run(context.Background(), "kill", "-KILL", strconv.Itoa(pid)) + } + return 0, err + } + if req.Jailer == nil { + // Belt-and-suspenders only on the legacy direct-firecracker path; + // the jailer drops to the configured uid before creating the + // socket, so its perms are correct by construction. + if err := mgr.EnsureSocketAccessFor(ctx, mc.SocketPath, "firecracker api socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil { + return 0, err + } + if strings.TrimSpace(mc.VSockPath) != "" { + if err := mgr.EnsureSocketAccessFor(ctx, mc.VSockPath, "firecracker vsock socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil { + return 0, err + } + } + } + pid := mgr.ResolvePID(context.Background(), machine, mc.SocketPath) + if pid <= 0 { + return 0, errors.New("firecracker started but pid could not be resolved") + } + return pid, nil +} + +// buildLaunchMachineConfig assembles the firecracker.MachineConfig used by +// launchFirecracker, performing the chroot staging when jailer is enabled. +// In the non-jailer case it's a straight field copy from the request. 
+// +// In the jailer case it: +// - validates JailerLaunchOpts (binary executable, chroot base under +// StateDir or RuntimeDir, uid/gid match the registered owner — the daemon +// can't ask the helper to drop firecracker into an arbitrary uid) +// - calls fcproc.PrepareJailerChroot to build the chroot tree +// - keeps SocketPath as the short request path, clears MetricsPath, and +// rewrites VSockPath/KernelImagePath/InitrdPath/Drives[].Path to chroot-internal names +func (s *Server) buildLaunchMachineConfig(ctx context.Context, req FirecrackerLaunchRequest, layout paths.Layout, mgr *fcproc.Manager) (firecracker.MachineConfig, error) { + mc := firecracker.MachineConfig{ BinaryPath: req.BinaryPath, VMID: req.VMID, SocketPath: req.SocketPath, @@ -734,31 +819,120 @@ func (s *Server) launchFirecracker(ctx context.Context, req FirecrackerLaunchReq VCPUCount: req.VCPUCount, MemoryMiB: req.MemoryMiB, Logger: s.logger, - }) - if err != nil { - return 0, err } - if err := machine.Start(ctx); err != nil { - manager := fcproc.New(s.runner, fcproc.Config{BridgeName: req.Network.BridgeName, BridgeIP: req.Network.BridgeIP, CIDR: req.Network.CIDR}, s.logger) - if pid := manager.ResolvePID(context.Background(), machine, req.SocketPath); pid > 0 { - _, _ = s.runner.Run(context.Background(), "kill", "-KILL", strconv.Itoa(pid)) - } - return 0, err + if req.Jailer == nil { + return mc, nil } - manager := fcproc.New(s.runner, fcproc.Config{BridgeName: req.Network.BridgeName, BridgeIP: req.Network.BridgeIP, CIDR: req.Network.CIDR}, s.logger) - if err := manager.EnsureSocketAccessFor(ctx, req.SocketPath, "firecracker api socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil { - return 0, err + if err := s.validateJailerOpts(*req.Jailer, layout); err != nil { + return firecracker.MachineConfig{}, err + } + chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID) + driveSpecs := make([]fcproc.ChrootDriveSpec, 0, len(req.Drives)) + chrootDrives := 
make([]firecracker.DriveConfig, 0, len(req.Drives)) + 
for _, d := range req.Drives { + name := chrootDriveName(d) + driveSpecs = append(driveSpecs, fcproc.ChrootDriveSpec{ChrootName: name, HostPath: d.Path}) + chrootDrives = append(chrootDrives, firecracker.DriveConfig{ + ID: d.ID, + Path: "/" + name, + ReadOnly: d.ReadOnly, + IsRoot: d.IsRoot, + }) + } + wantVSock := strings.TrimSpace(req.VSockPath) != "" + if err := mgr.PrepareJailerChroot(ctx, chrootRoot, + req.Jailer.UID, req.Jailer.GID, + req.BinaryPath, + req.KernelImagePath, "vmlinux", + req.InitrdPath, "initrd", + driveSpecs, wantVSock, + ); err != nil { + return firecracker.MachineConfig{}, fmt.Errorf("prepare jailer chroot: %w", err) + } + // See localPrivilegedOps.buildLaunchMachineConfig for why SocketPath + // stays the short req path but VSockPath becomes chroot-internal. + _ = chrootRoot + if wantVSock { + mc.VSockPath = firecracker.JailerVSockName + } + mc.KernelImagePath = "/vmlinux" + if strings.TrimSpace(req.InitrdPath) != "" { + mc.InitrdPath = "/initrd" + } else { + mc.InitrdPath = "" + } + mc.Drives = chrootDrives + // LogPath stays set so buildProcessRunner's openLogFile captures firecracker + // stderr via cmd.Stderr. buildConfig clears sdk.Config.LogPath for jailer + // mode to avoid PUT /logger with a host path firecracker can't open. + mc.MetricsPath = "" + mc.Jailer = &firecracker.JailerOpts{ + Binary: req.Jailer.Binary, + ChrootBaseDir: req.Jailer.ChrootBaseDir, + UID: req.Jailer.UID, + GID: req.Jailer.GID, + } + return mc, nil +} + +func (s *Server) validateJailerOpts(opts JailerLaunchOpts, layout paths.Layout) error { + if err := validateRootExecutable(opts.Binary); err != nil { + return fmt.Errorf("jailer binary: %w", err) + } + // Chroot base must live under StateDir so hard-links into the chroot + // share a filesystem with the image cache (RuntimeDir is tmpfs and + // would EXDEV on os.Link). RuntimeDir is also accepted because the + // jailer is happy on tmpfs when the kernel/drives happen to colocate + // (e.g. tests). 
+ if err := s.validateManagedPath(opts.ChrootBaseDir, layout.StateDir, layout.RuntimeDir); err != nil { + return fmt.Errorf("jailer chroot base: %w", err) + } + if opts.UID != s.meta.OwnerUID || opts.GID != s.meta.OwnerGID { + return fmt.Errorf("jailer uid/gid (%d:%d) must match registered owner (%d:%d)", opts.UID, opts.GID, s.meta.OwnerUID, s.meta.OwnerGID) + } + return nil +} + +// exposeJailerSockets makes the chroot-internal sockets reachable at the +// host paths the daemon already references (sc.apiSock, vm.Runtime.VSockPath). +// AF_UNIX connect(2) follows symlinks, so a symlink keeps the rest of the +// daemon code unchanged. Computes both host targets from the chroot root and +// the chroot-internal name, so the API socket and the vsock socket stay in +// sync regardless of how the launch request laid them out. +func (s *Server) exposeJailerSockets(req FirecrackerLaunchRequest) error { + if req.Jailer == nil { + return nil + } + chrootRoot := firecracker.JailerChrootRoot(req.Jailer.ChrootBaseDir, req.VMID) + hostAPI := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerSocketName, "/")) + if err := atomicSymlink(hostAPI, req.SocketPath); err != nil { + return fmt.Errorf("api socket symlink: %w", err) } if strings.TrimSpace(req.VSockPath) != "" { - if err := manager.EnsureSocketAccessFor(ctx, req.VSockPath, "firecracker vsock socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil { - return 0, err + hostVSock := filepath.Join(chrootRoot, strings.TrimPrefix(firecracker.JailerVSockName, "/")) + if err := atomicSymlink(hostVSock, req.VSockPath); err != nil { + return fmt.Errorf("vsock symlink: %w", err) } } - pid := manager.ResolvePID(context.Background(), machine, req.SocketPath) - if pid <= 0 { - return 0, errors.New("firecracker started but pid could not be resolved") + return nil +} + +func atomicSymlink(target, link string) error { + if err := os.Remove(link); err != nil && !os.IsNotExist(err) { + return err } - return pid, nil + return 
os.Symlink(target, link) +} + +// chrootDriveName returns the bare filename a drive should appear as inside +// the chroot. We use the drive ID when present (rootfs, work, …) so the +// chroot listing is self-explanatory; falling back to the source's basename +// covers the unnamed case. +func chrootDriveName(d firecracker.DriveConfig) string { + if id := strings.TrimSpace(d.ID); id != "" { + return id + } + return filepath.Base(d.Path) } func (s *Server) validateLaunchDrivePath(drive firecracker.DriveConfig, stateDir string) error { diff --git a/internal/system/system.go b/internal/system/system.go index 84a74df..3c4a5ba 100644 --- a/internal/system/system.go +++ b/internal/system/system.go @@ -172,7 +172,19 @@ func ProcessRunning(pid int, apiSock string) bool { return false } cmdline := strings.ReplaceAll(string(data), "\x00", " ") - return strings.Contains(cmdline, "firecracker") && strings.Contains(cmdline, apiSock) + if !strings.Contains(cmdline, "firecracker") { + return false + } + if strings.Contains(cmdline, apiSock) { + return true + } + // Jailer mode: apiSock is a symlink; firecracker's cmdline has the + // chroot-internal path (e.g. "/firecracker.socket"), not the host path. + // Fall back to matching the symlink target's base name. + if target, err := os.Readlink(apiSock); err == nil { + return strings.Contains(cmdline, filepath.Base(target)) + } + return false } type ProcessStats struct {