banger/internal/daemon/daemon.go
Thales Maciel 853249dec2
roothelper: tighten input validation across privileged RPCs
Defence-in-depth pass over every helper method that touches the host
as root. Each fix narrows what a compromised owner-uid daemon could
ask the helper to do; many close concrete file-ownership and DoS
primitives that the previous validators didn't reach.

Path / identifier validation:
  * priv.fsck_snapshot now requires /dev/mapper/fc-rootfs-* (was
    "is the string non-empty"). e2fsck -fy on /dev/sda1 was the
    motivating exploit.
  * priv.kill_process and priv.signal_process now read
    /proc/<pid>/cmdline and require a "firecracker" substring before
    sending the signal. Killing arbitrary host PIDs (sshd, init, …)
    is no longer a one-RPC primitive.
  * priv.read_ext4_file and priv.write_ext4_files now require the
    image path to live under StateDir or be /dev/mapper/fc-rootfs-*.
  * priv.cleanup_dm_snapshot validates every non-empty Handles field:
    DM name fc-rootfs-*, DM device /dev/mapper/fc-rootfs-*, loops
    /dev/loopN.
  * priv.remove_dm_snapshot accepts only fc-rootfs-* names or
    /dev/mapper/fc-rootfs-* paths.
  * priv.ensure_nat now requires a parsable IPv4 address and a
    banger-prefixed tap.
  * priv.sync_resolver_routing and priv.clear_resolver_routing now
    require a Linux iface-name-shaped bridge name (1–15 chars, no
    whitespace/'/'/':') and, for sync, a parsable resolver address.

Symlink defence:
  * priv.ensure_socket_access now validates the socket path is under
    RuntimeDir and not a symlink. The fcproc layer's chown/chmod
    moves to unix.Open(O_PATH|O_NOFOLLOW) + Fchownat(AT_EMPTY_PATH)
    + Fchmodat via /proc/self/fd, so even a swap of the leaf into a
    symlink between validation and the syscall is refused. The
    local-priv (non-root) fallback uses `chown -h`.
  * priv.cleanup_jailer_chroot rejects symlinks at both the leaf
    (os.Lstat) and intermediate path components (filepath.EvalSymlinks
    + clean-equality). The umount sweep was rewritten from shell
    `umount --recursive --lazy` to direct unix.Unmount(MNT_DETACH |
    UMOUNT_NOFOLLOW) per child mount, deepest-first; the findmnt
    guard remains as the rm-rf safety net. Local-priv mode falls
    back to `sudo umount --lazy`.

Binary validation:
  * validateRootExecutable now opens with O_PATH|O_NOFOLLOW and
    Fstats through the resulting fd. Rejects path-level symlinks and
    narrows the TOCTOU window between validation and the SDK's exec
    to fork+exec time on a healthy host.

Daemon socket:
  * The owner daemon now reads SO_PEERCRED on every accepted
    connection and refuses any UID that isn't 0 or the registered
    owner. Filesystem perms (0600 + ownerUID) already enforced this;
    the check is belt-and-braces in case the socket FD is ever
    leaked to a non-owner process.

Docs:
  * docs/privileges.md walked end-to-end. Each helper RPC's
    Validation gate row reflects what the code actually enforces.
    New section "Running outside the system install" calls out the
    looser dev-mode trust model (NOPASSWD sudoers, helper hardening
    bypassed) so users don't deploy that path on shared hosts.
    Trust list updated to include every new validator.

Tests added: validators (DM-loop, DM-remove-target, DM-handles,
ext4-image-path, iface-name, IPv4, resolver-addr, not-symlink,
firecracker-PID, root-executable variants), the daemon's authorize
path (non-unix conn rejection + unix conn happy path), the umount2
ordering contract (deepest-first + --lazy on the sudo branch), and
positive/negative cases for the chown-no-follow fallback.

Verified end-to-end via `make smoke JOBS=4` on a KVM host.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 14:39:41 -03:00

614 lines
19 KiB
Go

package daemon
import (
"bufio"
"context"
"encoding/json"
"errors"
"fmt"
"log/slog"
"net"
"os"
"path/filepath"
"strings"
"sync"
"time"
"golang.org/x/sys/unix"
"banger/internal/config"
ws "banger/internal/daemon/workspace"
"banger/internal/installmeta"
"banger/internal/model"
"banger/internal/paths"
"banger/internal/roothelper"
"banger/internal/rpc"
"banger/internal/store"
"banger/internal/system"
"banger/internal/vmdns"
)
// Daemon is the composition root: shared infrastructure (store,
// runner, logger, layout, config) plus pointers to the focused
// services that own behavior. Open wires the services; the dispatch
// loop forwards RPCs to them. No lifecycle / image / workspace /
// networking behavior lives on *Daemon itself — it's wiring.
type Daemon struct {
// layout is the layout the daemon itself runs against; its
// SocketPath, StateDir and DBPath are read in this file.
layout paths.Layout
// userLayout is the owner's layout. Open passes the same value as
// layout; OpenSystem passes the layout resolved for the owner home.
userLayout paths.Layout
// config is the daemon configuration handed to every service.
config model.DaemonConfig
// store is the database handle opened from layout.DBPath.
store *store.Store
// runner is the command runner shared by the services.
runner system.CommandRunner
// logger may be nil (methods in this file guard for that).
logger *slog.Logger
// priv is the privileged-operations backend: the root-helper client
// in system mode, otherwise a local implementation built by
// wireServices.
priv privilegedOps
// net, img, ws, vm and stats are the services populated by
// wireServices (each is left alone if already non-nil).
net *HostNetwork
img *ImageService
ws *WorkspaceService
vm *VMService
stats *StatsService
// closing is closed by Close; Serve and backgroundLoop select on it
// to stop.
closing chan struct{}
// once makes Close run its teardown a single time.
once sync.Once
// pid is os.Getpid() captured when the daemon was opened.
pid int
// listener is set by Serve and closed by Close.
listener net.Listener
// vmCaps is filled with defaultCapabilities by wireServices when empty.
vmCaps []vmCapability
// requestHandler, when non-nil, receives every request from dispatch
// in place of the rpcHandlers table.
requestHandler func(context.Context, rpc.Request) rpc.Response
// guestWaitForSSH and guestDial are presumably the SSH-client test
// seams mentioned on wireServices — confirm where they are read.
guestWaitForSSH func(context.Context, string, string, time.Duration) error
guestDial func(context.Context, string, string) (guestSSHClient, error)
// clientUID / clientGID are the uid/gid Serve chowns the socket to
// when both are >= 0. authorizeConn accepts clientUID as the peer
// uid, falling back to the process uid when it is negative.
clientUID int
clientGID int
}
// Open starts a daemon for the invoking user. It resolves the user's
// layout, runs paths.Ensure on it, loads configuration from it, and
// hands everything to openWithConfig with the current process uid/gid
// as the socket owner. The same layout is used as both the daemon and
// the user layout, SSH client config syncing is requested, and no
// privileged-ops backend is supplied (wireServices builds the local one).
func Open(ctx context.Context) (d *Daemon, err error) {
	userLayout, err := paths.Resolve()
	if err != nil {
		return nil, err
	}
	if err = paths.Ensure(userLayout); err != nil {
		return nil, err
	}
	cfg, err := config.Load(userLayout)
	if err != nil {
		return nil, err
	}
	uid, gid := os.Getuid(), os.Getgid()
	return openWithConfig(ctx, userLayout, userLayout, cfg, uid, gid, true, nil)
}
// OpenSystem starts the daemon in system mode. Install metadata names
// the owner; the daemon runs against the system layout while
// configuration is loaded from the owner's layout. Privileged work is
// routed through the root-helper client, the socket is not chowned
// (uid/gid are passed as -1), and SSH client config syncing is off.
func OpenSystem(ctx context.Context) (*Daemon, error) {
	meta, err := installmeta.Load(installmeta.DefaultPath)
	if err != nil {
		return nil, err
	}
	sysLayout := paths.ResolveSystem()
	if err = paths.EnsureSystemOwned(sysLayout); err != nil {
		return nil, err
	}
	ownerLayout, err := paths.ResolveUserForHome(meta.OwnerHome)
	if err != nil {
		return nil, err
	}
	cfg, err := config.LoadDaemon(ownerLayout, meta.OwnerHome)
	if err != nil {
		return nil, err
	}
	// The config loader derives JailerChrootBase from the layout it is
	// given — here the owner's layout, which has no privileged StateDir.
	// That is wrong for the helper, which validates paths against the
	// system StateDir. Replace the value whenever it is blank, relative,
	// or sits under the owner's StateDir, so daemon and helper agree on
	// <system StateDir>/jail. An absolute path elsewhere is kept as-is.
	chroot := cfg.JailerChrootBase
	switch {
	case strings.TrimSpace(chroot) == "",
		!filepath.IsAbs(chroot),
		strings.HasPrefix(chroot, ownerLayout.StateDir):
		cfg.JailerChrootBase = filepath.Join(sysLayout.StateDir, "jail")
	}
	helperClient := roothelper.NewClient(installmeta.DefaultRootHelperSocketPath)
	helperOps := newHelperPrivilegedOps(helperClient, cfg, sysLayout)
	return openWithConfig(ctx, sysLayout, ownerLayout, cfg, -1, -1, false, helperOps)
}
// openWithConfig builds a Daemon from already-resolved inputs and brings
// it to a ready state: logger, store, service wiring, optional SSH
// client config sync, VM DNS, reconcile, resolver routing and tap-pool
// seeding.
//
// layout is the layout the daemon runs against; userLayout is the
// owner's layout. clientUID/clientGID are stored for Serve (socket
// ownership) and authorizeConn. priv may be nil, in which case
// wireServices installs the local privileged-ops implementation.
//
// On any failure after the services are wired, the partially opened
// daemon is closed before the error is returned; the returned *Daemon
// is nil in that case.
func openWithConfig(ctx context.Context, layout, userLayout paths.Layout, cfg model.DaemonConfig, clientUID, clientGID int, syncSSHConfig bool, priv privilegedOps) (d *Daemon, err error) {
	logger, normalizedLevel, err := newDaemonLogger(os.Stderr, cfg.LogLevel)
	if err != nil {
		return nil, err
	}
	cfg.LogLevel = normalizedLevel
	db, err := store.Open(layout.DBPath)
	if err != nil {
		return nil, err
	}
	closing := make(chan struct{})
	runner := system.NewRunner()
	d = &Daemon{
		layout:     layout,
		userLayout: userLayout,
		config:     cfg,
		store:      db,
		runner:     runner,
		logger:     logger,
		closing:    closing,
		pid:        os.Getpid(),
		clientUID:  clientUID,
		clientGID:  clientGID,
		priv:       priv,
	}
	wireServices(d)

	// From here on, every failure path must run Close() so the host
	// state we touched (DNS listener goroutine, resolvectl routing,
	// SQLite handle, future side effects) gets unwound.
	//
	// The cleanup must go through its own variable, not the named result
	// d: `return nil, err` assigns nil to d *before* deferred functions
	// run, so calling d.Close() here would dereference a nil *Daemon and
	// panic instead of cleaning up.
	opened := d
	defer func() {
		if err == nil {
			return
		}
		// The original error is what the caller needs; a cleanup failure
		// is only logged so it is not silently lost.
		if closeErr := opened.Close(); closeErr != nil && opened.logger != nil {
			opened.logger.Warn("daemon open cleanup failed", "error", closeErr.Error())
		}
	}()

	if syncSSHConfig {
		d.ensureVMSSHClientConfig()
	}
	d.logger.Info("daemon opened", "socket", layout.SocketPath, "state_dir", layout.StateDir, "log_level", cfg.LogLevel)
	if err = d.net.startVMDNS(vmdns.DefaultListenAddr); err != nil {
		d.logger.Error("daemon open failed", "stage", "start_vm_dns", "error", err.Error())
		return nil, err
	}
	if err = d.reconcile(ctx); err != nil {
		d.logger.Error("daemon open failed", "stage", "reconcile", "error", err.Error())
		return nil, err
	}
	d.net.ensureVMDNSResolverRouting(ctx)
	// Seed HostNetwork's pool index from taps already claimed by VMs
	// on disk so newly warmed pool entries don't collide with them.
	if d.config.TapPoolSize > 0 && d.store != nil {
		vms, listErr := d.store.ListVMs(ctx)
		if listErr != nil {
			d.logger.Error("daemon open failed", "stage", "initialize_tap_pool", "error", listErr.Error())
			return nil, listErr
		}
		used := make([]string, 0, len(vms))
		for _, vm := range vms {
			if tap := d.vm.vmHandles(vm.ID).TapDevice; tap != "" {
				used = append(used, tap)
			}
		}
		d.net.initializeTapPool(used)
	}
	// Pool warming runs in the background and deliberately does not use
	// the caller's ctx, which may end once Open returns.
	go d.net.ensureTapPool(context.Background())
	return d, nil
}
// Close shuts the daemon down. The teardown body runs at most once;
// later calls skip it and return nil. In order it: logs, closes the
// closing channel, closes the listener if Serve installed one, closes
// the store if present, clears VM DNS resolver routing and stops VM
// DNS. The routing, DNS and store errors are joined into the result.
func (d *Daemon) Close() error {
	var result error
	d.once.Do(func() {
		if d.logger != nil {
			d.logger.Info("daemon closing")
		}
		close(d.closing)
		if d.listener != nil {
			_ = d.listener.Close()
		}
		var storeErr error
		if d.store != nil {
			storeErr = d.store.Close()
		}
		routingErr := d.net.clearVMDNSResolverRouting(context.Background())
		dnsErr := d.net.stopVMDNS()
		result = errors.Join(routingErr, dnsErr, storeErr)
	})
	return result
}
// Serve listens on the daemon's unix socket and hands each accepted
// connection to handleConn on its own goroutine. It also starts the
// background loop.
//
// It returns nil when ctx is cancelled or the daemon is closed, and a
// non-nil error when the socket cannot be set up, when the listener
// has been closed by anything else, or when Accept fails with a
// non-network error. Other network errors from Accept are treated as
// transient and retried after a short pause.
func (d *Daemon) Serve(ctx context.Context) error {
	// Best-effort removal of a socket file left by a previous run; a
	// genuine problem with the path surfaces from net.Listen below.
	_ = os.Remove(d.layout.SocketPath)
	listener, err := net.Listen("unix", d.layout.SocketPath)
	if err != nil {
		if d.logger != nil {
			d.logger.Error("daemon listen failed", "socket", d.layout.SocketPath, "error", err.Error())
		}
		return err
	}
	d.listener = listener
	defer listener.Close()
	defer os.Remove(d.layout.SocketPath)

	// Unblock Accept when ctx ends. Close() closes the listener itself,
	// and serveDone lets this goroutine exit when Serve returns for any
	// other reason.
	serveDone := make(chan struct{})
	defer close(serveDone)
	go func() {
		select {
		case <-ctx.Done():
			_ = listener.Close()
		case <-d.closing:
		case <-serveDone:
		}
	}()

	// Tighten the socket mode while root still owns it, then hand it to
	// the configured client uid/gid. In the hardened systemd unit we keep
	// CAP_CHOWN but intentionally do not keep the broader file-ownership
	// capability set that would be needed to chmod after chown.
	if err := os.Chmod(d.layout.SocketPath, 0o600); err != nil {
		return err
	}
	if d.clientUID >= 0 && d.clientGID >= 0 {
		if err := os.Chown(d.layout.SocketPath, d.clientUID, d.clientGID); err != nil {
			return err
		}
	}
	if d.logger != nil {
		d.logger.Info("daemon serving", "socket", d.layout.SocketPath, "pid", d.pid)
	}
	go d.backgroundLoop()

	for {
		conn, err := listener.Accept()
		if err == nil {
			go d.handleConn(conn)
			continue
		}
		// An Accept error during shutdown is expected, not a failure.
		select {
		case <-ctx.Done():
			return nil
		case <-d.closing:
			return nil
		default:
		}
		// A closed listener never recovers. Accept reports it as a
		// *net.OpError, which also satisfies net.Error, so without this
		// check the retry branch below would spin forever.
		if errors.Is(err, net.ErrClosed) {
			if d.logger != nil {
				d.logger.Error("daemon accept failed", "error", err.Error())
			}
			return err
		}
		if _, ok := err.(net.Error); ok {
			if d.logger != nil {
				d.logger.Warn("daemon accept temporary failure", "error", err.Error())
			}
			time.Sleep(100 * time.Millisecond)
			continue
		}
		if d.logger != nil {
			d.logger.Error("daemon accept failed", "error", err.Error())
		}
		return err
	}
}
// handleConn serves exactly one request on conn and then closes it.
// The peer is authorized first; then a single JSON request is decoded
// and dispatched with a context that is cancelled if the client goes
// away mid-request. No response is written once that context has been
// cancelled.
func (d *Daemon) handleConn(conn net.Conn) {
	defer conn.Close()

	// refuse logs the failure (when a logger is set) and sends the
	// client an RPC error; the encode result is best-effort because the
	// connection is being dropped either way.
	refuse := func(logMsg, code string, cause error) {
		if d.logger != nil {
			d.logger.Warn(logMsg, "remote", conn.RemoteAddr().String(), "error", cause.Error())
		}
		_ = json.NewEncoder(conn).Encode(rpc.NewError(code, cause.Error()))
	}

	if authErr := d.authorizeConn(conn); authErr != nil {
		refuse("daemon connection rejected", "unauthorized", authErr)
		return
	}

	var req rpc.Request
	reader := bufio.NewReader(conn)
	if decodeErr := json.NewDecoder(reader).Decode(&req); decodeErr != nil {
		refuse("daemon request decode failed", "bad_request", decodeErr)
		return
	}

	reqCtx, cancel := context.WithCancel(context.Background())
	defer cancel()
	stopWatch := d.watchRequestDisconnect(conn, reader, req.Method, cancel)
	defer stopWatch()

	resp := d.dispatch(reqCtx, req)
	if reqCtx.Err() != nil {
		// The watcher cancelled the request, so skip the reply.
		return
	}
	encodeErr := json.NewEncoder(conn).Encode(resp)
	if encodeErr == nil || d.logger == nil {
		return
	}
	d.logger.Warn("daemon response encode failed", "method", req.Method, "remote", conn.RemoteAddr().String(), "error", encodeErr.Error())
}
// authorizeConn checks the peer's credentials (SO_PEERCRED) on an
// accepted connection. This is a second line of defence behind the
// socket's filesystem permissions (0600, chowned to the owner): those
// already keep other host users out, and the credential check covers
// the case where the socket FD reaches a non-owner process some other
// way. Mirrors the equivalent check in roothelper.authorizeConn.
//
// Only unix connections are accepted. The peer uid must be 0 or the
// configured clientUID; when clientUID is negative the daemon's own
// uid is used instead. Returns nil when the peer is allowed.
func (d *Daemon) authorizeConn(conn net.Conn) error {
	uc, isUnix := conn.(*net.UnixConn)
	if !isUnix {
		return errors.New("daemon requires unix connections")
	}
	raw, err := uc.SyscallConn()
	if err != nil {
		return err
	}

	var (
		peer    *unix.Ucred
		sockErr error
	)
	ctlErr := raw.Control(func(fd uintptr) {
		peer, sockErr = unix.GetsockoptUcred(int(fd), unix.SOL_SOCKET, unix.SO_PEERCRED)
	})
	switch {
	case ctlErr != nil:
		return ctlErr
	case sockErr != nil:
		return sockErr
	case peer == nil:
		return errors.New("missing peer credentials")
	}

	allowed := d.clientUID
	if allowed < 0 {
		allowed = os.Getuid()
	}
	if uid := int(peer.Uid); uid != 0 && uid != allowed {
		return fmt.Errorf("uid %d is not allowed to use the daemon", peer.Uid)
	}
	return nil
}
// watchRequestDisconnect watches conn while a request is in flight and
// calls cancel if reading from it fails (for example because the client
// hung up). Bytes that do arrive are discarded.
//
// The returned stop function ends the watch and is safe to call more
// than once. Stopping expires the connection's read deadline (when the
// conn supports one) so the blocked read returns, and the resulting
// read error is ignored rather than treated as a disconnect. If conn or
// reader is nil, nothing is started and the returned function is a
// no-op.
func (d *Daemon) watchRequestDisconnect(conn net.Conn, reader *bufio.Reader, method string, cancel context.CancelFunc) func() {
	if conn == nil || reader == nil {
		return func() {}
	}

	stopped := make(chan struct{})
	var stopOnce sync.Once

	// unblockRead waits for the stop signal and then forces any pending
	// read on conn to return.
	unblockRead := func() {
		<-stopped
		if ds, ok := conn.(interface{ SetReadDeadline(time.Time) error }); ok {
			_ = ds.SetReadDeadline(time.Now())
		}
	}

	watch := func() {
		go unblockRead()
		var scratch [1]byte
		for {
			if _, readErr := reader.Read(scratch[:]); readErr != nil {
				select {
				case <-stopped:
					// The request finished normally; the error is ours.
					return
				default:
				}
				if d.logger != nil {
					d.logger.Info("daemon request canceled", "method", method, "remote", conn.RemoteAddr().String(), "error", readErr.Error())
				}
				cancel()
				return
			}
		}
	}
	go watch()

	return func() {
		stopOnce.Do(func() { close(stopped) })
	}
}
// dispatch routes one request to its handler and returns the response.
//
// Resolution order: a version mismatch yields "bad_version"; otherwise
// an installed requestHandler takes the request; otherwise the method
// is looked up in rpcHandlers, with "unknown_method" for a miss.
//
// A correlation id is generated for every request, including ones that
// fail before a handler runs, so each failure can be traced. It is
// attached to ctx and stamped onto error responses that don't already
// carry one. If generation fails the request simply proceeds without
// an id.
func (d *Daemon) dispatch(ctx context.Context, req rpc.Request) rpc.Response {
	opID, _ := model.NewOpID()
	if opID != "" {
		ctx = WithOpID(ctx, opID)
	}

	var resp rpc.Response
	switch {
	case req.Version != rpc.Version:
		resp = rpc.NewError("bad_version", fmt.Sprintf("unsupported version %d", req.Version))
	case d.requestHandler != nil:
		resp = d.requestHandler(ctx, req)
	default:
		if handler, known := rpcHandlers[req.Method]; known {
			resp = handler(ctx, d, req)
		} else {
			resp = rpc.NewError("unknown_method", req.Method)
		}
	}

	// Stamp the id onto failures only, and never overwrite one a
	// handler set itself.
	if opID != "" && !resp.OK && resp.Error != nil && resp.Error.OpID == "" {
		resp.Error.OpID = opID
	}
	return resp
}
// backgroundLoop runs the daemon's periodic work until the closing
// channel is closed: a stats poll every config.StatsPollInterval, and
// every model.DefaultStaleSweepInterval a stale-VM sweep followed by
// pruning VM-create operations older than ten minutes. Failures are
// logged (when a logger is set) and do not stop the loop.
func (d *Daemon) backgroundLoop() {
	statsTick := time.NewTicker(d.config.StatsPollInterval)
	sweepTick := time.NewTicker(model.DefaultStaleSweepInterval)
	defer statsTick.Stop()
	defer sweepTick.Stop()

	report := func(msg string, err error) {
		if err != nil && d.logger != nil {
			d.logger.Error(msg, "error", err.Error())
		}
	}

	for {
		select {
		case <-d.closing:
			return
		case <-statsTick.C:
			report("background stats poll failed", d.stats.pollStats(context.Background()))
		case <-sweepTick.C:
			report("background stale sweep failed", d.stats.stopStaleVMs(context.Background()))
			cutoff := time.Now().Add(-10 * time.Minute)
			d.vm.pruneVMCreateOperations(cutoff)
		}
	}
}
// reconcile walks every stored VM at startup and squares the record
// with reality, then rebuilds DNS. VMs not marked running just have
// their handles cleared. VMs marked running have their handles
// rediscovered; if the process turns out to be gone, the runtime is
// cleaned up and the record is persisted as stopped. The first store,
// lock or DNS error aborts the pass.
func (d *Daemon) reconcile(ctx context.Context) error {
	op := d.beginOperation(ctx, "daemon.reconcile")
	records, err := d.store.ListVMs(ctx)
	if err != nil {
		return op.fail(err)
	}

	// reconcileOne handles a single VM; it is run through
	// withVMLockByIDErr below.
	reconcileOne := func(vm model.VMRecord) error {
		if vm.State != model.VMStateRunning {
			// Belt-and-braces: a stopped VM should never have a scratch
			// file or a cache entry. Drop anything an ungraceful daemon
			// crash left behind.
			d.vm.clearVMHandles(vm)
			return nil
		}

		// Rebuild the in-memory handle cache from the per-VM scratch
		// file and check whether the firecracker process still lives.
		handles, alive, rediscoverErr := d.vm.rediscoverHandles(ctx, vm)
		if rediscoverErr != nil && d.logger != nil {
			d.logger.Warn("rediscover handles failed", "vm_id", vm.ID, "error", rediscoverErr.Error())
		}
		// Seed the cache regardless of the outcome: a live VM needs it
		// for later vmAlive() checks, and a dead one needs it so
		// cleanupRuntime knows which kernel resources (DM / loops / tap)
		// to tear down.
		d.vm.setVMHandlesInMemory(vm.ID, handles)
		if !alive {
			op.stage("stale_vm", vmLogAttrs(vm)...)
			_ = d.vm.cleanupRuntime(ctx, vm, true)
			vm.State = model.VMStateStopped
			vm.Runtime.State = model.VMStateStopped
			clearRuntimeTeardownState(&vm)
			d.vm.clearVMHandles(vm)
			vm.UpdatedAt = model.Now()
			return d.store.UpsertVM(ctx, vm)
		}
		return nil
	}

	for _, rec := range records {
		if lockErr := d.vm.withVMLockByIDErr(ctx, rec.ID, reconcileOne); lockErr != nil {
			return op.fail(lockErr, "vm_id", rec.ID)
		}
	}
	if dnsErr := d.vm.rebuildDNS(ctx); dnsErr != nil {
		return op.fail(dnsErr)
	}
	op.done()
	return nil
}
// FindVM stays on Daemon as a thin forwarder to the VM service lookup.
// Dispatch code reads the facade directly; tests that pre-date the
// service split keep compiling.
//
// idOrName is passed through unchanged to VMService.FindVM, and its
// record and error are returned as-is.
func (d *Daemon) FindVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
return d.vm.FindVM(ctx, idOrName)
}
// FindImage stays on Daemon as a thin forwarder to the image service
// lookup so callers reading dispatch code see the obvious facade, and
// tests that pre-date the service split still compile.
//
// idOrName is passed through unchanged to ImageService.FindImage, and
// its image and error are returned as-is.
func (d *Daemon) FindImage(ctx context.Context, idOrName string) (model.Image, error) {
return d.img.FindImage(ctx, idOrName)
}
// TouchVM is a thin forwarder to VMService.TouchVM: idOrName is passed
// through unchanged and the service's record and error are returned
// as-is.
func (d *Daemon) TouchVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
return d.vm.TouchVM(ctx, idOrName)
}
// wireServices populates the focused services (net, img, ws, vm,
// stats), the privileged-ops backend and the default VM capabilities
// from the infrastructure already on d (runner, logger, config, layout,
// store, closing, plus the SSH-client test seams).
// Idempotent: each service is skipped if the field is already non-nil,
// so tests can preinstall stubs for the services they want to fake and
// let wireServices fill the rest. The peer-service closures on
// WorkspaceService capture d rather than a direct *VMService pointer so
// the ws↔vm construction order doesn't recurse: the closures read d.vm
// at call time, by which point it is populated.
func wireServices(d *Daemon) {
// No backend supplied by the caller: fall back to the local
// privileged-ops implementation.
if d.priv == nil {
clientUID, clientGID := d.clientUID, d.clientGID
// A 0/0 client is rewritten to -1/-1 before being handed over —
// presumably "no ownership handoff needed when the client is root";
// confirm against newLocalPrivilegedOps.
if clientUID == 0 && clientGID == 0 {
clientUID, clientGID = -1, -1
}
d.priv = newLocalPrivilegedOps(d.runner, d.logger, d.config, d.layout, clientUID, clientGID)
}
// Host networking shares the daemon's closing channel and the
// privileged-ops backend chosen above.
if d.net == nil {
d.net = newHostNetwork(hostNetworkDeps{
runner: d.runner,
logger: d.logger,
config: d.config,
layout: d.layout,
closing: d.closing,
priv: d.priv,
})
}
if d.img == nil {
d.img = newImageService(imageServiceDeps{
runner: d.runner,
logger: d.logger,
config: d.config,
layout: d.layout,
store: d.store,
beginOperation: d.beginOperation,
})
}
// The workspace service is built before the VM service, so everything
// it needs from d.vm is supplied as a closure that dereferences d.vm
// when called rather than when constructed.
if d.ws == nil {
d.ws = newWorkspaceService(workspaceServiceDeps{
runner: d.runner,
logger: d.logger,
config: d.config,
layout: d.layout,
store: d.store,
repoInspector: ws.NewInspector(),
vmResolver: func(ctx context.Context, idOrName string) (model.VMRecord, error) {
return d.vm.FindVM(ctx, idOrName)
},
aliveChecker: func(vm model.VMRecord) bool {
return d.vm.vmAlive(vm)
},
waitGuestSSH: d.waitForGuestSSH,
dialGuest: d.dialGuest,
imageResolver: func(ctx context.Context, idOrName string) (model.Image, error) {
return d.FindImage(ctx, idOrName)
},
imageWorkSeed: func(ctx context.Context, image model.Image, fingerprint string) error {
return d.img.refreshManagedWorkSeedFingerprint(ctx, image, fingerprint)
},
withVMLockByRef: func(ctx context.Context, idOrName string, fn func(model.VMRecord) (model.VMRecord, error)) (model.VMRecord, error) {
return d.vm.withVMLockByRef(ctx, idOrName, fn)
},
beginOperation: d.beginOperation,
})
}
// The VM service receives d.net, d.img and d.ws as direct pointers,
// so it must be constructed after the three blocks above.
if d.vm == nil {
d.vm = newVMService(vmServiceDeps{
runner: d.runner,
logger: d.logger,
config: d.config,
layout: d.layout,
store: d.store,
net: d.net,
img: d.img,
ws: d.ws,
priv: d.priv,
capHooks: d.buildCapabilityHooks(),
beginOperation: d.beginOperation,
vsockHostDevice: defaultVsockHostDevice,
})
}
if d.stats == nil {
// Closures capture d rather than d.vm directly, so they re-read
// d.vm at call time. Wire order (d.vm constructed above) makes
// the closures safe, but this pattern also protects against a
// future test that swaps d.vm after initial wire.
d.stats = newStatsService(statsServiceDeps{
runner: d.runner,
logger: d.logger,
config: d.config,
store: d.store,
net: d.net,
beginOperation: d.beginOperation,
vmAlive: func(vm model.VMRecord) bool { return d.vm.vmAlive(vm) },
vmHandles: func(id string) model.VMHandles { return d.vm.vmHandles(id) },
withVMLockByRef: func(ctx context.Context, idOrName string, fn func(model.VMRecord) (model.VMRecord, error)) (model.VMRecord, error) {
return d.vm.withVMLockByRef(ctx, idOrName, fn)
},
withVMLockByIDErr: func(ctx context.Context, id string, fn func(model.VMRecord) error) error {
return d.vm.withVMLockByIDErr(ctx, id, fn)
},
cleanupRuntime: func(ctx context.Context, vm model.VMRecord, preserve bool) error {
return d.vm.cleanupRuntime(ctx, vm, preserve)
},
})
}
// Capabilities preinstalled by a caller are kept; only an empty list
// is replaced with the defaults.
if len(d.vmCaps) == 0 {
d.vmCaps = d.defaultCapabilities()
}
}
// marshalResultOrError turns a handler's (value, error) pair into an
// RPC response. A non-nil err becomes an "operation_failed" error
// response and v is not encoded. Otherwise v is encoded as the result,
// and an encoding failure becomes a "marshal_failed" error response.
func marshalResultOrError(v any, err error) rpc.Response {
	if err == nil {
		result, encodeErr := rpc.NewResult(v)
		if encodeErr == nil {
			return result
		}
		return rpc.NewError("marshal_failed", encodeErr.Error())
	}
	return rpc.NewError("operation_failed", err.Error())
}
// exists reports whether os.Stat succeeds for path. Any Stat error —
// not only "does not exist" — yields false.
func exists(path string) bool {
	if _, statErr := os.Stat(path); statErr != nil {
		return false
	}
	return true
}