diff --git a/Makefile b/Makefile index b67d4ec..a83ac63 100644 --- a/Makefile +++ b/Makefile @@ -51,10 +51,10 @@ help: ' make fmt Format Go sources under cmd/ and internal/' \ ' make tidy Run go mod tidy' \ ' make clean Remove built Go binaries and coverage artefacts' \ - ' make smoke Build instrumented binaries, run scripts/smoke.sh, report coverage (needs KVM + sudo)' \ - ' make smoke-fresh smoke-clean + smoke — forces first-install paths (migrations, image pull) into the coverage stamp' \ + ' make smoke Build instrumented binaries, run the supported systemd smoke suite, report coverage (needs KVM + sudo)' \ + ' make smoke-fresh smoke-clean + smoke — purges stale smoke-owned installs before a clean supported-path run' \ ' make smoke-coverage-html HTML coverage report from the last smoke run' \ - ' make smoke-clean Remove the smoke build tree' + ' make smoke-clean Remove the smoke build tree and purge any stale smoke-owned system install' build: $(BINARIES) @@ -143,14 +143,17 @@ clean: rm -rf "$(BUILD_BIN_DIR)" coverage.out coverage.html # Smoke test suite. Builds the three banger binaries with -cover -# instrumentation under $(SMOKE_BIN_DIR), runs scripts/smoke.sh -# with GOCOVERDIR pointed at $(SMOKE_COVER_DIR), and prints the -# resulting coverage. The smoke script fully isolates state via -# XDG_* env vars pointing at a mktemp'd root, so the invoking -# user's real banger install stays untouched. +# instrumentation under $(SMOKE_BIN_DIR), installs them as temporary +# bangerd.service + bangerd-root.service, runs scripts/smoke.sh, copies +# service covdata out of /var/lib/banger, then purges the smoke-owned +# install on exit. # -# Requires a KVM-capable Linux host with sudo; fails fast via -# `banger doctor` when either is missing. This is a pre-release +# Unlike the old per-user daemon path, this touches global systemd +# state. 
The smoke script refuses to overwrite a pre-existing non-smoke +# install and uses a marker file so `make smoke-clean` can recover a +# stale smoke-owned install after an interrupted run. +# +# Requires a KVM-capable Linux host with sudo. This is a pre-release # gate, not CI — the Go test suite is what runs everywhere. smoke-build: $(SMOKE_BIN_DIR)/.built @@ -178,15 +181,24 @@ smoke-coverage-html: smoke @echo 'wrote $(SMOKE_DIR)/cover.html' smoke-clean: + @if sudo test -f /etc/banger/.smoke-owned; then \ + bin=''; \ + if [ -x "$(SMOKE_BIN_DIR)/banger" ]; then \ + bin="$(abspath $(SMOKE_BIN_DIR))/banger"; \ + elif [ -x "$(BANGER_BIN)" ]; then \ + bin="$(abspath $(BANGER_BIN))"; \ + elif [ -x /usr/local/bin/banger ]; then \ + bin=/usr/local/bin/banger; \ + fi; \ + if [ -n "$$bin" ]; then \ + sudo "$$bin" system uninstall --purge >/dev/null 2>&1 || true; \ + fi; \ + fi rm -rf "$(SMOKE_DIR)" -# smoke-fresh wipes everything under $(SMOKE_DIR) (instrumented -# binaries, coverage pods, persisted XDG state) and runs a full -# smoke from scratch. Useful before a release tag: the regular -# `make smoke` reuses the XDG state across runs to skip the ~290MB -# image pull, which is fast but leaves migrations and image-upsert -# paths cold on every run after the first. smoke-fresh pays the -# time cost to stamp those paths into the coverage report too. +# smoke-fresh wipes the instrumented build tree, purges any stale +# smoke-owned install, and then runs the supported-path smoke suite +# from scratch. smoke-fresh: smoke-clean smoke install: build diff --git a/README.md b/README.md index bb18bcc..96d801b 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,8 @@ One-command development sandboxes on Firecracker microVMs. ## Quick start ```bash -make install +make build +sudo ./build/bin/banger system install --owner "$USER" banger vm run --name sandbox ``` @@ -15,46 +16,95 @@ dev tools) and kernel, creates a VM, starts it, and drops you into an interactive ssh session. 
First run takes a couple minutes (bundle download); subsequent `vm run`s are seconds. +## Supported host path + +banger's supported host/runtime path is: + +- Linux on `x86_64 / amd64` +- `systemd` as the host init/service manager +- `bangerd.service` running as the installed owner user +- `bangerd-root.service` running as the privileged host helper + +Other setups may work with manual adaptation, but they are not the +supported operating model for this repo. + ## Requirements - **x86_64 / amd64 Linux** — arm64 is not supported today. The companion binaries, the published kernel catalog, and the OCI import path all assume `linux/amd64`. `banger doctor` surfaces this as a failing check on other architectures. +- **systemd on the host** — this is the supported service-management + path. banger's supported install/run model is the owner-user + `bangerd.service` plus the privileged `bangerd-root.service` + installed by `banger system install`. - `/dev/kvm` -- `sudo` +- `sudo` for the install/admin commands (`system install`, + `system restart`, `system uninstall`) - Firecracker on `PATH`, or `firecracker_bin` set in config - host tools checked by `banger doctor` ## Build + install ```bash -make install +make build +sudo ./build/bin/banger system install --owner "$USER" ``` -Installs `banger` (CLI), `bangerd` (daemon, auto-starts on first -CLI call), and `banger-vsock-agent` (companion, under -`$PREFIX/lib/banger/`). +This installs two systemd units, copies the current `banger`, +`bangerd`, and `banger-vsock-agent` binaries into `/usr/local`, writes +install metadata under `/etc/banger`, and starts both services: -To remove the binaries (and stop the daemon): +- `bangerd.service` runs as the configured owner user and exposes the + public CLI socket at `/run/banger/bangerd.sock`. +- `bangerd-root.service` runs as root and handles the narrow set of + privileged host operations over the private helper socket at + `/run/banger-root/bangerd-root.sock`. 
+ +After that, normal daily commands such as `banger vm run` and +`banger image pull` are unprivileged. + +This `systemd` service flow is the supported path. If you're not on a +host that can run both services, you're outside the supported host +model even if some pieces happen to work. + +The split matters: + +- `bangerd.service` runs as the owner user, keeps its writable state in + `/var/lib/banger`, `/var/cache/banger`, and `/run/banger`, and sees + the owner home read-only. +- `bangerd-root.service` is the only process that keeps elevated host + capabilities, and that capability set is limited to the host-kernel + primitives banger actually uses (`CAP_CHOWN`, `CAP_DAC_OVERRIDE`, + `CAP_NET_ADMIN`, `CAP_NET_RAW`, `CAP_SYS_ADMIN`). + +To inspect or refresh the services: ```bash -make uninstall +banger system status +sudo banger system restart ``` -User data stays in place — the target prints the paths so you can -`rm -rf` them if you want a full purge: +To remove the system services: -- `~/.config/banger/` — config, managed SSH keys -- `~/.local/state/banger/` — VM records, rootfs images, kernels, daemon DB/log -- `~/.cache/banger/` — OCI layer cache +```bash +sudo banger system uninstall +``` + +Add `--purge` if you also want to remove system-owned VM/image/cache +state under `/var/lib/banger`, `/var/cache/banger`, `/run/banger`, and +`/run/banger-root`. User config stays in place under your home +directory: + +- `~/.config/banger/` — config, optional `ssh_config` +- `~/.local/state/banger/ssh/` — user SSH key + known_hosts ### Shell completion `banger` ships completion scripts for bash, zsh, fish, and powershell. Tab-completion covers subcommands, flags, and live -resource names (VM, image, kernel) looked up from the -daemon. With the daemon down, resource completion silently +resource names (VM, image, kernel) looked up from the installed +services. With the services down, resource completion silently returns nothing — no file-completion fallback. ```bash @@ -105,10 +155,12 @@ logs` inspection. 
## Hostnames: reaching `.vm` -banger's daemon runs a DNS server for the `.vm` zone. With host-side -DNS routing you can `curl http://sandbox.vm:3000` from anywhere on -the host — no copy-pasting guest IPs. On systemd-resolved hosts this -is auto-wired; everywhere else there's a short recipe. See +banger's owner daemon runs a DNS server for the `.vm` zone. With +host-side DNS routing you can `curl http://sandbox.vm:3000` from +anywhere on the host — no copy-pasting guest IPs. On +systemd-resolved hosts the owner daemon asks the root helper to +auto-wire this and that is the supported path. Everywhere else +there's a best-effort manual recipe. See [`docs/dns-routing.md`](docs/dns-routing.md). ### Optional: `ssh .vm` shortcut @@ -125,7 +177,9 @@ banger ssh-config # show the include line to paste manually ``` banger never touches `~/.ssh/config` on its own — the daemon keeps its -file fresh at `~/.config/banger/ssh_config`; whether and how it's +own known_hosts under `/var/lib/banger/ssh/known_hosts`, while +`banger ssh-config` keeps the user-facing file fresh at +`~/.config/banger/ssh_config`; whether and how it's pulled into your SSH config is up to you. ## Image catalog @@ -200,8 +254,12 @@ mode = "0755" # optional; default 0600 for files Runs at `vm create` time. Each entry copies `host` → `guest` onto the VM's work disk (mounted at `/root` in the guest). Guest paths -must live under `~/` or `/root/...`. Default is no entries — add the -ones you want. Symlinks encountered while recursing into a synced +must live under `~/` or `/root/...`. Host paths must live under the +installed owner's home directory; `~/...` is the intended form, and +absolute paths are accepted only when they still point inside that +home. Default is no entries — add the ones you want. A top-level +symlink is followed only when its resolved target stays inside the +owner home. 
Symlinks encountered while recursing into a synced directory are skipped with a warning — they'd otherwise leak files from outside the named tree (e.g. a symlink inside `~/.aws` pointing to an unrelated credential dir). diff --git a/docs/advanced.md b/docs/advanced.md index 90dee38..c05b8b5 100644 --- a/docs/advanced.md +++ b/docs/advanced.md @@ -4,6 +4,15 @@ rest: scripting, arbitrary images, custom rootfs stacks, long-lived guest processes. +Host-side assumption for everything below: the supported runtime model +is still the two-service `systemd` install: + +- `bangerd.service` running as the owner user +- `bangerd-root.service` running as the privileged host helper + +These advanced flows widen what you do with banger, not which host +init systems or privilege model are supported. + ## `vm create` — the low-level primitive Use when you want to provision without starting, or when you need to diff --git a/docs/dns-routing.md b/docs/dns-routing.md index 45f8d09..5321327 100644 --- a/docs/dns-routing.md +++ b/docs/dns-routing.md @@ -1,6 +1,6 @@ # DNS routing — resolving `.vm` hostnames from the host -banger's daemon runs a local DNS server on `127.0.0.1:42069` that +banger's owner daemon runs a local DNS server on `127.0.0.1:42069` that answers queries under the `.vm` zone. Every VM you create gets a record: @@ -17,11 +17,25 @@ curl http://devbox.vm:3000 from anywhere on the host without copy-pasting guest IPs. +## Supported path + +The supported host-side path is: + +- `systemd` on the host +- `bangerd.service` running as the owner user +- `bangerd-root.service` running as the privileged host helper +- `systemd-resolved` handling `.vm` routing via `resolvectl` + +If you're on a non-`systemd` host or a host without `systemd-resolved`, +the recipes below are best-effort guidance, not the primary supported +deployment model. 
+ ## systemd-resolved hosts — nothing to configure If your host uses `systemd-resolved` (most modern Linux desktops — Ubuntu ≥18.04, Fedora, Arch with the service enabled), banger -auto-wires it. On daemon start it runs: +auto-wires it. When the banger services start, the owner daemon asks +the root helper to apply the equivalent of: ``` sudo resolvectl dns 127.0.0.1:42069 @@ -36,12 +50,20 @@ your normal upstream. No other changes needed. Verify: `resolvectl status br-fc` should list `127.0.0.1:42069` under **Current DNS Server** and `~vm` under **DNS Domain**. -`banger daemon stop` reverts the bridge's resolvectl state on shutdown. +Stopping or uninstalling the services reverts the bridge's +`resolvectl` state on shutdown: + +```bash +sudo banger daemon stop +sudo banger system uninstall +``` ## Non-systemd-resolved hosts banger detects `resolvectl`'s absence and skips the auto-wire. You configure your own resolver. Below are recipes for the common cases. +They can be useful in local experiments, but this is outside banger's +supported host/runtime path. In every case the goal is the same: **route `.vm` queries to `127.0.0.1` port `42069`, leave everything else alone**. @@ -114,12 +136,13 @@ the VM either doesn't exist under that name or isn't running yet. ## Troubleshooting - **`resolvectl` errors about "system has not been booted with systemd - as init system"** — you're probably inside a container. banger's - DNS still works; set up your resolver manually. + as init system"** — you're probably inside a container or on a + non-`systemd` host. Manual resolver setup may still work, but that's + outside the supported path. - **Port 42069 already in use** — another daemon is bound there (previous banger instance not shut down cleanly, or an unrelated - app). `ss -ulpn | grep 42069` shows who. `banger daemon stop` - cleans up banger's own listener. + app). `ss -ulpn | grep 42069` shows who. 
`sudo banger daemon stop` + stops both banger services and cleans up banger's own listener. - **`devbox.vm` resolves but SSH hangs** — DNS is fine; the VM might not be up yet or the bridge NAT is misconfigured. `banger vm ssh devbox` uses the guest IP directly and bypasses diff --git a/internal/cli/banger.go b/internal/cli/banger.go index e7312f1..ba3737d 100644 --- a/internal/cli/banger.go +++ b/internal/cli/banger.go @@ -34,6 +34,7 @@ func (d *deps) newRootCommand() *cobra.Command { d.newInternalCommand(), d.newKernelCommand(), newSSHConfigCommand(), + d.newSystemCommand(), newVersionCommand(), d.newPSCommand(), d.newVMCommand(), diff --git a/internal/cli/bangerd.go b/internal/cli/bangerd.go index 13c55a1..6911ce0 100644 --- a/internal/cli/bangerd.go +++ b/internal/cli/bangerd.go @@ -1,12 +1,17 @@ package cli import ( + "errors" + "banger/internal/daemon" + "banger/internal/roothelper" "github.com/spf13/cobra" ) func NewBangerdCommand() *cobra.Command { + var systemMode bool + var rootHelperMode bool cmd := &cobra.Command{ Use: "bangerd", Short: "Run the banger daemon", @@ -14,7 +19,22 @@ func NewBangerdCommand() *cobra.Command { SilenceErrors: true, Args: noArgsUsage("usage: bangerd"), RunE: func(cmd *cobra.Command, args []string) error { - d, err := daemon.Open(cmd.Context()) + if systemMode && rootHelperMode { + return errors.New("choose only one of --system or --root-helper") + } + if rootHelperMode { + server, err := roothelper.Open() + if err != nil { + return err + } + defer server.Close() + return server.Serve(cmd.Context()) + } + open := daemon.Open + if systemMode { + open = daemon.OpenSystem + } + d, err := open(cmd.Context()) if err != nil { return err } @@ -22,6 +42,8 @@ func NewBangerdCommand() *cobra.Command { return d.Serve(cmd.Context()) }, } + cmd.Flags().BoolVar(&systemMode, "system", false, "run as the owner-user system service") + cmd.Flags().BoolVar(&rootHelperMode, "root-helper", false, "run as the privileged root helper service") 
cmd.CompletionOptions.DisableDefaultCmd = true return cmd } diff --git a/internal/cli/cli_test.go b/internal/cli/cli_test.go index faef9a9..db2ca4a 100644 --- a/internal/cli/cli_test.go +++ b/internal/cli/cli_test.go @@ -30,7 +30,7 @@ func TestNewBangerCommandHasExpectedSubcommands(t *testing.T) { for _, sub := range cmd.Commands() { names = append(names, sub.Name()) } - want := []string{"daemon", "doctor", "image", "internal", "kernel", "ps", "ssh-config", "version", "vm"} + want := []string{"daemon", "doctor", "image", "internal", "kernel", "ps", "ssh-config", "system", "version", "vm"} if !reflect.DeepEqual(names, want) { t.Fatalf("subcommands = %v, want %v", names, want) } @@ -1757,48 +1757,7 @@ func TestNewBangerdCommandRejectsArgs(t *testing.T) { } } -func TestDaemonOutdated(t *testing.T) { - d := defaultDeps() - dir := t.TempDir() - current := filepath.Join(dir, "bangerd-current") - same := filepath.Join(dir, "bangerd-same") - stale := filepath.Join(dir, "bangerd-stale") - if err := os.WriteFile(current, []byte("current"), 0o755); err != nil { - t.Fatalf("write current: %v", err) - } - if err := os.Link(current, same); err != nil { - t.Fatalf("hard link: %v", err) - } - if err := os.WriteFile(stale, []byte("stale"), 0o755); err != nil { - t.Fatalf("write stale: %v", err) - } - - d.bangerdPath = func() (string, error) { - return current, nil - } - d.daemonExePath = func(pid int) string { - if pid == 1 { - return same - } - return stale - } - - if d.daemonOutdated(1) { - t.Fatal("expected matching daemon executable to be current") - } - if !d.daemonOutdated(2) { - t.Fatal("expected replaced daemon executable to be outdated") - } -} - func TestDaemonStatusIncludesLogPathWhenStopped(t *testing.T) { - configHome := filepath.Join(t.TempDir(), "config") - stateHome := filepath.Join(t.TempDir(), "state") - runtimeHome := filepath.Join(t.TempDir(), "runtime") - t.Setenv("XDG_CONFIG_HOME", configHome) - t.Setenv("XDG_STATE_HOME", stateHome) - 
t.Setenv("XDG_RUNTIME_DIR", runtimeHome) - cmd := NewBangerCommand() var stdout bytes.Buffer cmd.SetOut(&stdout) @@ -1809,27 +1768,20 @@ func TestDaemonStatusIncludesLogPathWhenStopped(t *testing.T) { } output := stdout.String() - if !strings.Contains(output, "stopped\n") { - t.Fatalf("output = %q, want stopped status", output) - } - if !strings.Contains(output, "log: "+filepath.Join(stateHome, "banger", "bangerd.log")) { - t.Fatalf("output = %q, want daemon log path", output) - } - if !strings.Contains(output, "dns: 127.0.0.1:42069") { - t.Fatalf("output = %q, want dns listener", output) + for _, want := range []string{ + "service: bangerd.service", + "socket: /run/banger/bangerd.sock", + "log: journalctl -u bangerd.service", + } { + if !strings.Contains(output, want) { + t.Fatalf("output = %q, want %q", output, want) + } } } func TestDaemonStatusIncludesDaemonBuildInfoWhenRunning(t *testing.T) { d := defaultDeps() - configHome := filepath.Join(t.TempDir(), "config") - stateHome := filepath.Join(t.TempDir(), "state") - runtimeHome := filepath.Join(t.TempDir(), "runtime") - t.Setenv("XDG_CONFIG_HOME", configHome) - t.Setenv("XDG_STATE_HOME", stateHome) - t.Setenv("XDG_RUNTIME_DIR", runtimeHome) - d.daemonPing = func(context.Context, string) (api.PingResult, error) { return api.PingResult{ Status: "ok", @@ -1851,12 +1803,13 @@ func TestDaemonStatusIncludesDaemonBuildInfoWhenRunning(t *testing.T) { output := stdout.String() for _, want := range []string{ - "running\n", + "service: bangerd.service", + "socket: /run/banger/bangerd.sock", + "log: journalctl -u bangerd.service", "pid: 42", "version: v1.2.3", "commit: abc123", "built_at: 2026-03-22T12:00:00Z", - "log: " + filepath.Join(stateHome, "banger", "bangerd.log"), } { if !strings.Contains(output, want) { t.Fatalf("output = %q, want %q", output, want) @@ -1864,17 +1817,6 @@ func TestDaemonStatusIncludesDaemonBuildInfoWhenRunning(t *testing.T) { } } -func TestBuildDaemonCommandIsDetachedFromCallerContext(t 
*testing.T) { - cmd := buildDaemonCommand("/tmp/bangerd") - - if cmd.Path != "/tmp/bangerd" { - t.Fatalf("command path = %q", cmd.Path) - } - if cmd.Cancel != nil { - t.Fatal("daemon process should not be tied to a CLI request context") - } -} - func testCLIResolvedVM(id, name string) model.VMRecord { return model.VMRecord{ID: id, Name: name} } diff --git a/internal/cli/commands_daemon.go b/internal/cli/commands_daemon.go index f2f1d86..7669118 100644 --- a/internal/cli/commands_daemon.go +++ b/internal/cli/commands_daemon.go @@ -2,15 +2,9 @@ package cli import ( "fmt" - "os" - "strings" - "banger/internal/api" - "banger/internal/buildinfo" + "banger/internal/installmeta" "banger/internal/paths" - "banger/internal/rpc" - "banger/internal/system" - "banger/internal/vmdns" "github.com/spf13/cobra" ) @@ -18,50 +12,30 @@ import ( func (d *deps) newDaemonCommand() *cobra.Command { cmd := &cobra.Command{ Use: "daemon", - Short: "Manage the banger daemon", + Short: "Manage the installed banger services", RunE: helpNoArgs, } cmd.AddCommand( &cobra.Command{ Use: "status", - Short: "Show daemon status", + Short: "Show owner-daemon and root-helper status", Args: noArgsUsage("usage: banger daemon status"), RunE: func(cmd *cobra.Command, args []string) error { - layout, err := paths.Resolve() - if err != nil { - return err - } - ping, pingErr := d.daemonPing(cmd.Context(), layout.SocketPath) - if pingErr != nil { - _, err = fmt.Fprintf(cmd.OutOrStdout(), "stopped\nsocket: %s\nlog: %s\ndns: %s\n", layout.SocketPath, layout.DaemonLog, vmdns.DefaultListenAddr) - return err - } - info := buildinfo.Normalize(ping.Version, ping.Commit, ping.BuiltAt) - _, err = fmt.Fprintf(cmd.OutOrStdout(), "running\npid: %d\n%ssocket: %s\nlog: %s\ndns: %s\n", ping.PID, formatBuildInfoBlock(info), layout.SocketPath, layout.DaemonLog, vmdns.DefaultListenAddr) - return err + return d.runSystemStatus(cmd.Context(), cmd.OutOrStdout()) }, }, &cobra.Command{ Use: "stop", - Short: "Stop the daemon", + 
Short: "Stop the installed banger services", Args: noArgsUsage("usage: banger daemon stop"), RunE: func(cmd *cobra.Command, args []string) error { - if err := system.EnsureSudo(cmd.Context()); err != nil { + if err := requireRoot(); err != nil { return err } - layout, err := paths.Resolve() - if err != nil { + if err := d.runSystemctl(cmd.Context(), "stop", installmeta.DefaultService, installmeta.DefaultRootHelperService); err != nil { return err } - _, err = rpc.Call[api.ShutdownResult](cmd.Context(), layout.SocketPath, "shutdown", api.Empty{}) - if err != nil { - if os.IsNotExist(err) || strings.Contains(err.Error(), "connect") { - _, writeErr := fmt.Fprintln(cmd.OutOrStdout(), "daemon not running") - return writeErr - } - return err - } - _, err = fmt.Fprintln(cmd.OutOrStdout(), "stopping") + _, err := fmt.Fprintln(cmd.OutOrStdout(), "stopped") return err }, }, @@ -70,10 +44,8 @@ func (d *deps) newDaemonCommand() *cobra.Command { Short: "Print the daemon socket path", Args: noArgsUsage("usage: banger daemon socket"), RunE: func(cmd *cobra.Command, args []string) error { - layout, err := paths.Resolve() - if err != nil { - return err - } + layout := paths.ResolveSystem() + var err error _, err = fmt.Fprintln(cmd.OutOrStdout(), layout.SocketPath) return err }, diff --git a/internal/cli/commands_image.go b/internal/cli/commands_image.go index 46e29fe..4860a8a 100644 --- a/internal/cli/commands_image.go +++ b/internal/cli/commands_image.go @@ -8,7 +8,6 @@ import ( "banger/internal/api" "banger/internal/model" "banger/internal/rpc" - "banger/internal/system" "github.com/spf13/cobra" ) @@ -43,9 +42,6 @@ func (d *deps) newImageRegisterCommand() *cobra.Command { if err := absolutizeImageRegisterPaths(¶ms); err != nil { return err } - if err := system.EnsureSudo(cmd.Context()); err != nil { - return err - } layout, _, err := d.ensureDaemon(cmd.Context()) if err != nil { return err @@ -114,9 +110,6 @@ subcommand lands). 
if err := absolutizePaths(¶ms.KernelPath, ¶ms.InitrdPath, ¶ms.ModulesDir); err != nil { return err } - if err := system.EnsureSudo(cmd.Context()); err != nil { - return err - } layout, _, err := d.ensureDaemon(cmd.Context()) if err != nil { return err @@ -150,9 +143,6 @@ func (d *deps) newImagePromoteCommand() *cobra.Command { Args: exactArgsUsage(1, "usage: banger image promote "), ValidArgsFunction: d.completeImageNameOnlyAtPos0, RunE: func(cmd *cobra.Command, args []string) error { - if err := system.EnsureSudo(cmd.Context()); err != nil { - return err - } layout, _, err := d.ensureDaemon(cmd.Context()) if err != nil { return err @@ -214,9 +204,6 @@ func (d *deps) newImageDeleteCommand() *cobra.Command { Args: exactArgsUsage(1, "usage: banger image delete "), ValidArgsFunction: d.completeImageNameOnlyAtPos0, RunE: func(cmd *cobra.Command, args []string) error { - if err := system.EnsureSudo(cmd.Context()); err != nil { - return err - } layout, _, err := d.ensureDaemon(cmd.Context()) if err != nil { return err diff --git a/internal/cli/commands_ssh_config.go b/internal/cli/commands_ssh_config.go index 5ce5553..51da09a 100644 --- a/internal/cli/commands_ssh_config.go +++ b/internal/cli/commands_ssh_config.go @@ -3,6 +3,7 @@ package cli import ( "fmt" + "banger/internal/config" "banger/internal/daemon" "banger/internal/paths" @@ -39,6 +40,13 @@ terminal, bypassing 'banger vm ssh': if err != nil { return err } + cfg, err := config.Load(layout) + if err != nil { + return err + } + if err := daemon.SyncVMSSHClientConfig(layout, cfg.SSHKeyPath); err != nil { + return err + } bangerConfig := daemon.BangerSSHConfigPath(layout) switch { case install: diff --git a/internal/cli/commands_system.go b/internal/cli/commands_system.go new file mode 100644 index 0000000..cad7ad1 --- /dev/null +++ b/internal/cli/commands_system.go @@ -0,0 +1,385 @@ +package cli + +import ( + "context" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "strconv" + "strings" + + 
"banger/internal/buildinfo" + "banger/internal/installmeta" + "banger/internal/model" + "banger/internal/paths" + "banger/internal/system" + + "github.com/spf13/cobra" +) + +const ( + systemBangerBin = "/usr/local/bin/banger" + systemBangerdBin = "/usr/local/bin/bangerd" + systemCompanionDir = "/usr/local/lib/banger" + systemCompanionAgent = systemCompanionDir + "/banger-vsock-agent" + systemdUserUnitPath = "/etc/systemd/system/" + installmeta.DefaultService + systemdRootUnitPath = "/etc/systemd/system/" + installmeta.DefaultRootHelperService + systemCoverDirEnv = "BANGER_SYSTEM_GOCOVERDIR" + rootCoverDirEnv = "BANGER_ROOT_HELPER_GOCOVERDIR" +) + +func (d *deps) newSystemCommand() *cobra.Command { + var owner string + var purge bool + cmd := &cobra.Command{ + Use: "system", + Short: "Install and manage banger's system services", + RunE: helpNoArgs, + } + installCmd := &cobra.Command{ + Use: "install", + Short: "Install or refresh the owner daemon and root helper", + Args: noArgsUsage("usage: banger system install [--owner USER]"), + RunE: func(cmd *cobra.Command, args []string) error { + return d.runSystemInstall(cmd.Context(), cmd.OutOrStdout(), owner) + }, + } + installCmd.Flags().StringVar(&owner, "owner", "", "login user who will operate banger day-to-day") + + statusCmd := &cobra.Command{ + Use: "status", + Short: "Show owner-daemon and root-helper status", + Args: noArgsUsage("usage: banger system status"), + RunE: func(cmd *cobra.Command, args []string) error { + return d.runSystemStatus(cmd.Context(), cmd.OutOrStdout()) + }, + } + + restartCmd := &cobra.Command{ + Use: "restart", + Short: "Restart the installed banger services", + Args: noArgsUsage("usage: banger system restart"), + RunE: func(cmd *cobra.Command, args []string) error { + if err := requireRoot(); err != nil { + return err + } + if err := d.runSystemctl(cmd.Context(), "restart", installmeta.DefaultRootHelperService); err != nil { + return err + } + if err := d.runSystemctl(cmd.Context(), 
"restart", installmeta.DefaultService); err != nil { + return err + } + _, err := fmt.Fprintln(cmd.OutOrStdout(), "restarted") + return err + }, + } + + uninstallCmd := &cobra.Command{ + Use: "uninstall", + Short: "Remove the installed banger services", + Args: noArgsUsage("usage: banger system uninstall [--purge]"), + RunE: func(cmd *cobra.Command, args []string) error { + return d.runSystemUninstall(cmd.Context(), cmd.OutOrStdout(), purge) + }, + } + uninstallCmd.Flags().BoolVar(&purge, "purge", false, "also delete system-owned banger state and cache") + + cmd.AddCommand(installCmd, statusCmd, restartCmd, uninstallCmd) + return cmd +} + +func (d *deps) runSystemInstall(ctx context.Context, out io.Writer, ownerFlag string) error { + if err := requireRoot(); err != nil { + return err + } + meta, err := resolveInstallOwner(ownerFlag) + if err != nil { + return err + } + info := buildinfo.Current() + meta.Version = info.Version + meta.Commit = info.Commit + meta.BuiltAt = info.BuiltAt + meta.InstalledAt = model.Now() + + bangerBin, err := paths.BangerPath() + if err != nil { + return err + } + bangerdBin, err := paths.BangerdPath() + if err != nil { + return err + } + agentBin, err := paths.CompanionBinaryPath("banger-vsock-agent") + if err != nil { + return err + } + if err := os.MkdirAll(filepath.Dir(systemBangerBin), 0o755); err != nil { + return err + } + if err := os.MkdirAll(systemCompanionDir, 0o755); err != nil { + return err + } + if err := installFile(bangerBin, systemBangerBin, 0o755); err != nil { + return err + } + if err := installFile(bangerdBin, systemBangerdBin, 0o755); err != nil { + return err + } + if err := installFile(agentBin, systemCompanionAgent, 0o755); err != nil { + return err + } + if err := installmeta.Save(installmeta.DefaultPath, meta); err != nil { + return err + } + if err := paths.EnsureSystem(paths.ResolveSystem()); err != nil { + return err + } + if err := os.WriteFile(systemdRootUnitPath, []byte(renderRootHelperSystemdUnit()), 
0o644); err != nil { + return err + } + if err := os.WriteFile(systemdUserUnitPath, []byte(renderSystemdUnit(meta)), 0o644); err != nil { + return err + } + if err := d.runSystemctl(ctx, "daemon-reload"); err != nil { + return err + } + if err := d.runSystemctl(ctx, "enable", installmeta.DefaultRootHelperService); err != nil { + return err + } + if err := d.runSystemctl(ctx, "enable", installmeta.DefaultService); err != nil { + return err + } + if err := d.runSystemctl(ctx, "restart", installmeta.DefaultRootHelperService); err != nil { + return err + } + if err := d.runSystemctl(ctx, "restart", installmeta.DefaultService); err != nil { + return err + } + _, err = fmt.Fprintf(out, "installed\nowner: %s\nsocket: %s\nhelper_socket: %s\nservice: %s\nhelper_service: %s\n", meta.OwnerUser, installmeta.DefaultSocketPath, installmeta.DefaultRootHelperSocketPath, installmeta.DefaultService, installmeta.DefaultRootHelperService) + return err +} + +func (d *deps) runSystemStatus(ctx context.Context, out io.Writer) error { + layout := paths.ResolveSystem() + active := d.systemctlQuery(ctx, "is-active", installmeta.DefaultService) + if active == "" { + active = "unknown" + } + enabled := d.systemctlQuery(ctx, "is-enabled", installmeta.DefaultService) + if enabled == "" { + enabled = "unknown" + } + helperActive := d.systemctlQuery(ctx, "is-active", installmeta.DefaultRootHelperService) + if helperActive == "" { + helperActive = "unknown" + } + helperEnabled := d.systemctlQuery(ctx, "is-enabled", installmeta.DefaultRootHelperService) + if helperEnabled == "" { + helperEnabled = "unknown" + } + fmt.Fprintf(out, "service: %s\nenabled: %s\nactive: %s\nhelper_service: %s\nhelper_enabled: %s\nhelper_active: %s\nsocket: %s\nhelper_socket: %s\nlog: journalctl -u %s -u %s\n", + installmeta.DefaultService, enabled, active, + installmeta.DefaultRootHelperService, helperEnabled, helperActive, + layout.SocketPath, installmeta.DefaultRootHelperSocketPath, + installmeta.DefaultService, 
installmeta.DefaultRootHelperService) + if ping, err := d.daemonPing(ctx, layout.SocketPath); err == nil { + info := buildinfo.Normalize(ping.Version, ping.Commit, ping.BuiltAt) + _, err = fmt.Fprintf(out, "pid: %d\n%s", ping.PID, formatBuildInfoBlock(info)) + return err + } + return nil +} + +func (d *deps) runSystemUninstall(ctx context.Context, out io.Writer, purge bool) error { + if err := requireRoot(); err != nil { + return err + } + _ = d.runSystemctl(ctx, "disable", "--now", installmeta.DefaultService, installmeta.DefaultRootHelperService) + _ = os.Remove(systemdUserUnitPath) + _ = os.Remove(systemdRootUnitPath) + _ = os.Remove(installmeta.DefaultPath) + _ = os.Remove(installmeta.DefaultDir) + _ = d.runSystemctl(ctx, "daemon-reload") + _ = os.Remove(systemBangerdBin) + _ = os.Remove(systemBangerBin) + _ = os.RemoveAll(systemCompanionDir) + if purge { + _ = os.RemoveAll(paths.ResolveSystem().StateDir) + _ = os.RemoveAll(paths.ResolveSystem().CacheDir) + _ = os.RemoveAll(paths.ResolveSystem().RuntimeDir) + } + msg := "uninstalled" + if purge { + msg += " (purged state)" + } + _, err := fmt.Fprintln(out, msg) + return err +} + +func resolveInstallOwner(ownerFlag string) (installmeta.Metadata, error) { + owner := strings.TrimSpace(ownerFlag) + if owner == "" { + owner = strings.TrimSpace(os.Getenv("SUDO_USER")) + } + if owner == "" { + return installmeta.Metadata{}, errors.New("owner is required; pass --owner USER when installing without sudo") + } + if owner == "root" { + return installmeta.Metadata{}, errors.New("refusing to install with root as the banger owner") + } + return installmeta.LookupOwner(owner) +} + +func renderSystemdUnit(meta installmeta.Metadata) string { + lines := []string{ + "[Unit]", + "Description=banger daemon", + "After=network-online.target", + "Wants=network-online.target " + installmeta.DefaultRootHelperService, + "After=" + installmeta.DefaultRootHelperService, + "Requires=" + installmeta.DefaultRootHelperService, + "", + 
"[Service]", + "Type=simple", + "User=" + meta.OwnerUser, + "ExecStart=" + systemBangerdBin + " --system", + "Restart=on-failure", + "RestartSec=1s", + "Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "Environment=TMPDIR=/run/banger", + "UMask=0077", + "NoNewPrivileges=yes", + "PrivateMounts=yes", + "ProtectSystem=strict", + "ProtectHome=read-only", + "ProtectControlGroups=yes", + "ProtectKernelLogs=yes", + "ProtectKernelModules=yes", + "ProtectClock=yes", + "ProtectHostname=yes", + "RestrictSUIDSGID=yes", + "LockPersonality=yes", + "SystemCallArchitectures=native", + "RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 AF_NETLINK AF_VSOCK", + "StateDirectory=banger", + "StateDirectoryMode=0700", + "CacheDirectory=banger", + "CacheDirectoryMode=0700", + "RuntimeDirectory=banger", + "RuntimeDirectoryMode=0700", + } + if coverDir := strings.TrimSpace(os.Getenv(systemCoverDirEnv)); coverDir != "" { + lines = append(lines, "Environment=GOCOVERDIR="+systemdQuote(coverDir)) + } + if home := strings.TrimSpace(meta.OwnerHome); home != "" { + lines = append(lines, "ReadOnlyPaths="+systemdQuote(home)) + } + lines = append(lines, + "", + "[Install]", + "WantedBy=multi-user.target", + "", + ) + return strings.Join(lines, "\n") +} + +func renderRootHelperSystemdUnit() string { + lines := []string{ + "[Unit]", + "Description=banger root helper", + "After=network-online.target", + "Wants=network-online.target", + "", + "[Service]", + "Type=simple", + "ExecStart=" + systemBangerdBin + " --root-helper", + "Restart=on-failure", + "RestartSec=1s", + "Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "Environment=TMPDIR=" + installmeta.DefaultRootHelperRuntimeDir, + "UMask=0077", + "NoNewPrivileges=yes", + "PrivateTmp=yes", + "PrivateMounts=yes", + "ProtectSystem=strict", + "ProtectHome=yes", + "ProtectControlGroups=yes", + "ProtectKernelLogs=yes", + "ProtectKernelModules=yes", + "ProtectClock=yes", + 
"ProtectHostname=yes", + "RestrictSUIDSGID=yes", + "LockPersonality=yes", + "SystemCallArchitectures=native", + "RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 AF_NETLINK AF_VSOCK", + "CapabilityBoundingSet=CAP_CHOWN CAP_DAC_OVERRIDE CAP_NET_ADMIN CAP_NET_RAW CAP_SYS_ADMIN", + "ReadWritePaths=/var/lib/banger", + "RuntimeDirectory=banger-root", + "RuntimeDirectoryMode=0711", + } + if coverDir := strings.TrimSpace(os.Getenv(rootCoverDirEnv)); coverDir != "" { + lines = append(lines, "Environment=GOCOVERDIR="+systemdQuote(coverDir)) + } + lines = append(lines, + "", + "[Install]", + "WantedBy=multi-user.target", + "", + ) + return strings.Join(lines, "\n") +} + +func systemdQuote(value string) string { + return strconv.Quote(strings.TrimSpace(value)) +} + +func installFile(sourcePath, targetPath string, mode os.FileMode) error { + if err := os.MkdirAll(filepath.Dir(targetPath), 0o755); err != nil { + return err + } + tempPath := targetPath + ".tmp" + _ = os.Remove(tempPath) + if err := system.CopyFilePreferClone(sourcePath, tempPath); err != nil { + return err + } + if err := os.Chmod(tempPath, mode); err != nil { + _ = os.Remove(tempPath) + return err + } + if err := os.Rename(tempPath, targetPath); err != nil { + _ = os.Remove(tempPath) + return err + } + return nil +} + +func requireRoot() error { + if os.Geteuid() == 0 { + return nil + } + return errors.New("this command requires root; run it with sudo") +} + +func (d *deps) runSystemctl(ctx context.Context, args ...string) error { + _, err := d.hostCommandOutput(ctx, "systemctl", args...) + return err +} + +func (d *deps) systemctlQuery(ctx context.Context, args ...string) string { + output, err := d.hostCommandOutput(ctx, "systemctl", args...) 
+ if err == nil { + return strings.TrimSpace(string(output)) + } + msg := strings.TrimSpace(string(output)) + if msg != "" { + return msg + } + msg = strings.TrimSpace(err.Error()) + if idx := strings.LastIndex(msg, ": "); idx >= 0 { + return strings.TrimSpace(msg[idx+2:]) + } + return msg +} diff --git a/internal/cli/commands_vm.go b/internal/cli/commands_vm.go index f4c31fd..1db668f 100644 --- a/internal/cli/commands_vm.go +++ b/internal/cli/commands_vm.go @@ -37,7 +37,7 @@ func (d *deps) newVMCommand() *cobra.Command { d.newVMActionCommand("stop", "Stop a VM", "vm.stop"), d.newVMKillCommand(), d.newVMActionCommand("restart", "Restart a VM", "vm.restart"), - d.newVMActionCommand("delete", "Delete a VM", "vm.delete", "rm"), + d.newVMDeleteCommand(), d.newVMPruneCommand(), d.newVMSetCommand(), d.newVMSSHCommand(), @@ -143,9 +143,6 @@ Three modes: if err != nil { return err } - if err := system.EnsureSudo(cmd.Context()); err != nil { - return err - } layout, cfg, err = d.ensureDaemon(cmd.Context()) if err != nil { return err @@ -177,9 +174,6 @@ func (d *deps) newVMKillCommand() *cobra.Command { Args: minArgsUsage(1, "usage: banger vm kill [--signal SIGTERM|SIGKILL|...] ..."), ValidArgsFunction: d.completeVMNames, RunE: func(cmd *cobra.Command, args []string) error { - if err := system.EnsureSudo(cmd.Context()); err != nil { - return err - } layout, _, err := d.ensureDaemon(cmd.Context()) if err != nil { return err @@ -222,9 +216,6 @@ func (d *deps) newVMPruneCommand() *cobra.Command { Long: "Scan for VMs in state other than 'running' (stopped, created, error) and delete them after confirmation. 
Use -f to skip the prompt.", Args: noArgsUsage("usage: banger vm prune [-f|--force]"), RunE: func(cmd *cobra.Command, args []string) error { - if err := system.EnsureSudo(cmd.Context()); err != nil { - return err - } layout, _, err := d.ensureDaemon(cmd.Context()) if err != nil { return err @@ -288,6 +279,9 @@ func (d *deps) runVMPrune(cmd *cobra.Command, socketPath string, force bool) err failed++ continue } + if err := removeUserKnownHosts(vm); err != nil { + fmt.Fprintf(stderr, "known_hosts cleanup %s: %v\n", ref, err) + } fmt.Fprintln(stdout, "deleted", ref) } if failed > 0 { @@ -333,9 +327,6 @@ func (d *deps) newVMCreateCommand() *cobra.Command { if err != nil { return err } - if err := system.EnsureSudo(cmd.Context()); err != nil { - return err - } layout, _, err := d.ensureDaemon(cmd.Context()) if err != nil { return err @@ -462,9 +453,6 @@ func (d *deps) newVMActionCommand(use, short, method string, aliases ...string) Args: minArgsUsage(1, fmt.Sprintf("usage: banger vm %s ...", use)), ValidArgsFunction: d.completeVMNames, RunE: func(cmd *cobra.Command, args []string) error { - if err := system.EnsureSudo(cmd.Context()); err != nil { - return err - } layout, _, err := d.ensureDaemon(cmd.Context()) if err != nil { return err @@ -487,6 +475,40 @@ func (d *deps) newVMActionCommand(use, short, method string, aliases ...string) } } +func (d *deps) newVMDeleteCommand() *cobra.Command { + return &cobra.Command{ + Use: "delete ...", + Aliases: []string{"rm"}, + Short: "Delete a VM", + Args: minArgsUsage(1, "usage: banger vm delete ..."), + ValidArgsFunction: d.completeVMNames, + RunE: func(cmd *cobra.Command, args []string) error { + layout, _, err := d.ensureDaemon(cmd.Context()) + if err != nil { + return err + } + deleteOne := func(ctx context.Context, id string) (model.VMRecord, error) { + result, err := rpc.Call[api.VMShowResult](ctx, layout.SocketPath, "vm.delete", api.VMRefParams{IDOrName: id}) + if err != nil { + return model.VMRecord{}, err + } + if err := 
removeUserKnownHosts(result.VM); err != nil { + _, _ = fmt.Fprintf(cmd.ErrOrStderr(), "known_hosts cleanup for %s: %v\n", id, err) + } + return result.VM, nil + } + if len(args) > 1 { + return runVMBatchAction(cmd, layout.SocketPath, args, deleteOne) + } + vm, err := deleteOne(cmd.Context(), args[0]) + if err != nil { + return err + } + return printVMSummary(cmd.OutOrStdout(), vm) + }, + } +} + func (d *deps) newVMSetCommand() *cobra.Command { var ( vcpu int @@ -505,9 +527,6 @@ func (d *deps) newVMSetCommand() *cobra.Command { if err != nil { return err } - if err := system.EnsureSudo(cmd.Context()); err != nil { - return err - } layout, _, err := d.ensureDaemon(cmd.Context()) if err != nil { return err diff --git a/internal/cli/completion.go b/internal/cli/completion.go index 8032efd..d6d1a32 100644 --- a/internal/cli/completion.go +++ b/internal/cli/completion.go @@ -70,10 +70,7 @@ func defaultCompletionLister(ctx context.Context, socketPath, method string) ([] // already running. Returns "", false when no daemon is up — completion // callers use this as the bail signal. func (d *deps) daemonSocketForCompletion(ctx context.Context) (string, bool) { - layout, err := paths.Resolve() - if err != nil { - return "", false - } + layout := paths.ResolveSystem() if _, err := d.daemonPing(ctx, layout.SocketPath); err != nil { return "", false } diff --git a/internal/cli/daemon_lifecycle.go b/internal/cli/daemon_lifecycle.go index 5b8822b..ec9f011 100644 --- a/internal/cli/daemon_lifecycle.go +++ b/internal/cli/daemon_lifecycle.go @@ -2,137 +2,60 @@ package cli import ( "context" + "errors" "fmt" "os" - "os/exec" - "syscall" - "time" + "strings" - "banger/internal/api" "banger/internal/config" + "banger/internal/installmeta" "banger/internal/model" "banger/internal/paths" - "banger/internal/rpc" ) -// ensureDaemon pings the socket; on miss it auto-starts bangerd, on -// version mismatch it restarts. 
Every CLI command that needs to talk -// to the daemon routes through here. +var ( + loadInstallMetadata = func() (installmeta.Metadata, error) { + return installmeta.Load(installmeta.DefaultPath) + } + currentUID = os.Getuid +) + +// ensureDaemon validates that the current CLI user matches the +// installed banger owner, then pings the system socket. Every CLI +// command that needs to talk to the daemon routes through here. func (d *deps) ensureDaemon(ctx context.Context) (paths.Layout, model.DaemonConfig, error) { - layout, err := paths.Resolve() + meta, metaErr := loadInstallMetadata() + if metaErr == nil && currentUID() != meta.OwnerUID { + return paths.Layout{}, model.DaemonConfig{}, fmt.Errorf("banger is installed for %s; switch to that user or reinstall with `sudo banger system install --owner %s`", meta.OwnerUser, userHint()) + } + if metaErr != nil && !errors.Is(metaErr, os.ErrNotExist) { + return paths.Layout{}, model.DaemonConfig{}, fmt.Errorf("load %s: %w", installmeta.DefaultPath, metaErr) + } + + userLayout, err := paths.Resolve() if err != nil { return paths.Layout{}, model.DaemonConfig{}, err } - cfg, err := config.Load(layout) + cfg, err := config.Load(userLayout) if err != nil { return paths.Layout{}, model.DaemonConfig{}, err } - if ping, err := d.daemonPing(ctx, layout.SocketPath); err == nil { - if d.daemonOutdated(ping.PID) { - if err := d.restartDaemon(ctx, layout, ping.PID); err != nil { - return paths.Layout{}, model.DaemonConfig{}, err - } - return layout, cfg, nil - } + layout := paths.ResolveSystem() + if _, err := d.daemonPing(ctx, layout.SocketPath); err == nil { return layout, cfg, nil } - if err := d.startDaemon(ctx, layout); err != nil { - return paths.Layout{}, model.DaemonConfig{}, err + if metaErr == nil { + return paths.Layout{}, model.DaemonConfig{}, fmt.Errorf("banger service not reachable at %s; run `sudo banger system restart`", layout.SocketPath) } - return layout, cfg, nil + return paths.Layout{}, model.DaemonConfig{}, 
fmt.Errorf("banger service not running at %s; run `sudo banger system install`", layout.SocketPath) } -// daemonOutdated reports whether the running daemon binary differs -// from the one on disk — useful after `make install` when the user's -// session still holds a handle to an old daemon. os.SameFile compares -// inode + dev, so a fresh binary at the same path registers as -// different. -func (d *deps) daemonOutdated(pid int) bool { - if pid <= 0 { - return false +func userHint() string { + if sudoUser := strings.TrimSpace(os.Getenv("SUDO_USER")); sudoUser != "" { + return sudoUser } - daemonBin, err := d.bangerdPath() - if err != nil { - return false + if user := strings.TrimSpace(os.Getenv("USER")); user != "" { + return user } - currentInfo, err := os.Stat(daemonBin) - if err != nil { - return false - } - runningInfo, err := os.Stat(d.daemonExePath(pid)) - if err != nil { - return false - } - return !os.SameFile(currentInfo, runningInfo) -} - -func (d *deps) restartDaemon(ctx context.Context, layout paths.Layout, pid int) error { - stopCtx, cancel := context.WithTimeout(ctx, 2*time.Second) - defer cancel() - - _, _ = rpc.Call[api.ShutdownResult](stopCtx, layout.SocketPath, "shutdown", api.Empty{}) - if waitForPIDExit(pid, 2*time.Second) { - return d.startDaemon(ctx, layout) - } - if proc, err := os.FindProcess(pid); err == nil { - _ = proc.Signal(syscall.SIGTERM) - } - if !waitForPIDExit(pid, 2*time.Second) { - return fmt.Errorf("timed out restarting stale daemon pid %d", pid) - } - return d.startDaemon(ctx, layout) -} - -func waitForPIDExit(pid int, timeout time.Duration) bool { - deadline := time.Now().Add(timeout) - for time.Now().Before(deadline) { - if !pidRunning(pid) { - return true - } - time.Sleep(50 * time.Millisecond) - } - return !pidRunning(pid) -} - -func pidRunning(pid int) bool { - if pid <= 0 { - return false - } - proc, err := os.FindProcess(pid) - if err != nil { - return false - } - return proc.Signal(syscall.Signal(0)) == nil -} - -func 
(d *deps) startDaemon(ctx context.Context, layout paths.Layout) error { - if err := paths.Ensure(layout); err != nil { - return err - } - logFile, err := os.OpenFile(layout.DaemonLog, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o644) - if err != nil { - return err - } - defer logFile.Close() - - daemonBin, err := paths.BangerdPath() - if err != nil { - return err - } - cmd := buildDaemonCommand(daemonBin) - cmd.Stdout = logFile - cmd.Stderr = logFile - cmd.Stdin = nil - cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} - if err := cmd.Start(); err != nil { - return err - } - if err := rpc.WaitForSocket(layout.SocketPath, 5*time.Second); err != nil { - return fmt.Errorf("daemon failed to start; inspect %s: %w", layout.DaemonLog, err) - } - return nil -} - -func buildDaemonCommand(daemonBin string) *exec.Cmd { - return exec.Command(daemonBin) + return "" } diff --git a/internal/cli/daemon_lifecycle_test.go b/internal/cli/daemon_lifecycle_test.go new file mode 100644 index 0000000..c050e18 --- /dev/null +++ b/internal/cli/daemon_lifecycle_test.go @@ -0,0 +1,215 @@ +package cli + +import ( + "context" + "errors" + "os" + "path/filepath" + "strings" + "testing" + + "banger/internal/api" + "banger/internal/installmeta" +) + +func TestEnsureDaemonRequiresSystemInstallWhenMetadataMissing(t *testing.T) { + t.Setenv("XDG_CONFIG_HOME", filepath.Join(t.TempDir(), "config")) + t.Setenv("XDG_STATE_HOME", filepath.Join(t.TempDir(), "state")) + t.Setenv("XDG_CACHE_HOME", filepath.Join(t.TempDir(), "cache")) + t.Setenv("XDG_RUNTIME_DIR", filepath.Join(t.TempDir(), "run")) + + restoreLoad := loadInstallMetadata + restoreUID := currentUID + t.Cleanup(func() { + loadInstallMetadata = restoreLoad + currentUID = restoreUID + }) + + loadInstallMetadata = func() (installmeta.Metadata, error) { + return installmeta.Metadata{}, os.ErrNotExist + } + currentUID = os.Getuid + + d := defaultDeps() + d.daemonPing = func(context.Context, string) (api.PingResult, error) { + return api.PingResult{}, 
errors.New("dial unix /run/banger/bangerd.sock: no such file") + } + + _, _, err := d.ensureDaemon(context.Background()) + if err == nil || !strings.Contains(err.Error(), "sudo banger system install") { + t.Fatalf("ensureDaemon error = %v, want install guidance", err) + } +} + +func TestEnsureDaemonSuggestsRestartWhenInstalledButUnavailable(t *testing.T) { + t.Setenv("XDG_CONFIG_HOME", filepath.Join(t.TempDir(), "config")) + t.Setenv("XDG_STATE_HOME", filepath.Join(t.TempDir(), "state")) + t.Setenv("XDG_CACHE_HOME", filepath.Join(t.TempDir(), "cache")) + t.Setenv("XDG_RUNTIME_DIR", filepath.Join(t.TempDir(), "run")) + + restoreLoad := loadInstallMetadata + restoreUID := currentUID + t.Cleanup(func() { + loadInstallMetadata = restoreLoad + currentUID = restoreUID + }) + + loadInstallMetadata = func() (installmeta.Metadata, error) { + return installmeta.Metadata{ + OwnerUser: "tester", + OwnerUID: os.Getuid(), + OwnerGID: os.Getgid(), + OwnerHome: t.TempDir(), + }, nil + } + currentUID = os.Getuid + + d := defaultDeps() + d.daemonPing = func(context.Context, string) (api.PingResult, error) { + return api.PingResult{}, errors.New("dial unix /run/banger/bangerd.sock: connection refused") + } + + _, _, err := d.ensureDaemon(context.Background()) + if err == nil || !strings.Contains(err.Error(), "sudo banger system restart") { + t.Fatalf("ensureDaemon error = %v, want restart guidance", err) + } +} + +func TestEnsureDaemonRejectsNonOwnerUser(t *testing.T) { + restoreLoad := loadInstallMetadata + restoreUID := currentUID + t.Cleanup(func() { + loadInstallMetadata = restoreLoad + currentUID = restoreUID + }) + + loadInstallMetadata = func() (installmeta.Metadata, error) { + return installmeta.Metadata{ + OwnerUser: "alice", + OwnerUID: os.Getuid() + 1, + OwnerGID: os.Getgid(), + OwnerHome: t.TempDir(), + }, nil + } + currentUID = os.Getuid + + d := defaultDeps() + d.daemonPing = func(context.Context, string) (api.PingResult, error) { + t.Fatal("daemonPing should not be 
called for a non-owner user") + return api.PingResult{}, nil + } + + _, _, err := d.ensureDaemon(context.Background()) + if err == nil || !strings.Contains(err.Error(), "installed for alice") { + t.Fatalf("ensureDaemon error = %v, want owner mismatch guidance", err) + } +} + +func TestSystemSubcommandFlagsAreScoped(t *testing.T) { + root := NewBangerCommand() + + systemCmd, _, err := root.Find([]string{"system"}) + if err != nil { + t.Fatalf("find system: %v", err) + } + installCmd, _, err := systemCmd.Find([]string{"install"}) + if err != nil { + t.Fatalf("find system install: %v", err) + } + uninstallCmd, _, err := systemCmd.Find([]string{"uninstall"}) + if err != nil { + t.Fatalf("find system uninstall: %v", err) + } + if installCmd.Flags().Lookup("owner") == nil { + t.Fatal("system install is missing --owner") + } + if uninstallCmd.Flags().Lookup("purge") == nil { + t.Fatal("system uninstall is missing --purge") + } +} + +func TestRenderSystemdUnitIncludesHardeningDirectives(t *testing.T) { + unit := renderSystemdUnit(installmeta.Metadata{ + OwnerUser: "alice", + OwnerUID: 1000, + OwnerGID: 1000, + OwnerHome: "/home/alice/dev home", + }) + + for _, want := range []string{ + "ExecStart=/usr/local/bin/bangerd --system", + "User=alice", + "Wants=network-online.target bangerd-root.service", + "After=bangerd-root.service", + "Requires=bangerd-root.service", + "UMask=0077", + "Environment=TMPDIR=/run/banger", + "NoNewPrivileges=yes", + "PrivateMounts=yes", + "ProtectSystem=strict", + "ProtectHome=read-only", + "ProtectControlGroups=yes", + "ProtectKernelLogs=yes", + "ProtectKernelModules=yes", + "ProtectClock=yes", + "ProtectHostname=yes", + "RestrictSUIDSGID=yes", + "LockPersonality=yes", + "SystemCallArchitectures=native", + "RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 AF_NETLINK AF_VSOCK", + "StateDirectory=banger", + "StateDirectoryMode=0700", + "CacheDirectory=banger", + "CacheDirectoryMode=0700", + "RuntimeDirectory=banger", + "RuntimeDirectoryMode=0700", 
+ `ReadOnlyPaths="/home/alice/dev home"`, + } { + if !strings.Contains(unit, want) { + t.Fatalf("unit = %q, want %q", unit, want) + } + } +} + +func TestRenderRootHelperSystemdUnitIncludesRequiredCapabilities(t *testing.T) { + unit := renderRootHelperSystemdUnit() + + for _, want := range []string{ + "ExecStart=/usr/local/bin/bangerd --root-helper", + "Environment=TMPDIR=/run/banger-root", + "NoNewPrivileges=yes", + "PrivateTmp=yes", + "PrivateMounts=yes", + "ProtectSystem=strict", + "ProtectHome=yes", + "RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 AF_NETLINK AF_VSOCK", + "CapabilityBoundingSet=CAP_CHOWN CAP_DAC_OVERRIDE CAP_NET_ADMIN CAP_NET_RAW CAP_SYS_ADMIN", + "ReadWritePaths=/var/lib/banger", + "RuntimeDirectory=banger-root", + "RuntimeDirectoryMode=0711", + } { + if !strings.Contains(unit, want) { + t.Fatalf("unit = %q, want %q", unit, want) + } + } +} + +func TestRenderSystemdUnitsIncludeOptionalCoverageEnv(t *testing.T) { + t.Setenv(systemCoverDirEnv, "/var/lib/banger") + t.Setenv(rootCoverDirEnv, "/var/lib/banger") + + userUnit := renderSystemdUnit(installmeta.Metadata{ + OwnerUser: "alice", + OwnerUID: 1000, + OwnerGID: 1000, + OwnerHome: "/home/alice", + }) + if !strings.Contains(userUnit, `Environment=GOCOVERDIR="/var/lib/banger"`) { + t.Fatalf("user unit = %q, want GOCOVERDIR env", userUnit) + } + + rootUnit := renderRootHelperSystemdUnit() + if !strings.Contains(rootUnit, `Environment=GOCOVERDIR="/var/lib/banger"`) { + t.Fatalf("root unit = %q, want GOCOVERDIR env", rootUnit) + } +} diff --git a/internal/cli/known_hosts.go b/internal/cli/known_hosts.go new file mode 100644 index 0000000..806e3ad --- /dev/null +++ b/internal/cli/known_hosts.go @@ -0,0 +1,26 @@ +package cli + +import ( + "strings" + + "banger/internal/guest" + "banger/internal/model" +) + +func removeUserKnownHosts(vm model.VMRecord) error { + knownHostsPath, err := bangerKnownHostsPath() + if err != nil { + return err + } + var hosts []string + if ip := 
strings.TrimSpace(vm.Runtime.GuestIP); ip != "" { + hosts = append(hosts, ip) + } + if dns := strings.TrimSpace(vm.Runtime.DNSName); dns != "" { + hosts = append(hosts, dns) + } + if len(hosts) == 0 { + return nil + } + return guest.RemoveKnownHosts(knownHostsPath, hosts...) +} diff --git a/internal/cli/vm_run.go b/internal/cli/vm_run.go index 3c8d60d..1b8b182 100644 --- a/internal/cli/vm_run.go +++ b/internal/cli/vm_run.go @@ -158,6 +158,8 @@ func (d *deps) runVMRun(ctx context.Context, socketPath string, cfg model.Daemon defer cancel() if err := d.vmDelete(cleanupCtx, socketPath, vmRef); err != nil { printVMRunWarning(stderr, fmt.Sprintf("--rm cleanup failed: %v (leaked vm %q; delete manually)", err, vmRef)) + } else if err := removeUserKnownHosts(vm); err != nil { + printVMRunWarning(stderr, fmt.Sprintf("known_hosts cleanup failed: %v", err)) } }() } diff --git a/internal/config/config.go b/internal/config/config.go index 24cac8c..700c01a 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -52,6 +52,18 @@ type vmDefaultsFile struct { } func Load(layout paths.Layout) (model.DaemonConfig, error) { + home, err := os.UserHomeDir() + if err != nil { + return model.DaemonConfig{}, err + } + return load(layout, home, true) +} + +func LoadDaemon(layout paths.Layout, ownerHome string) (model.DaemonConfig, error) { + return load(layout, ownerHome, false) +} + +func load(layout paths.Layout, home string, ensureDefaultSSHKey bool) (model.DaemonConfig, error) { cfg := model.DaemonConfig{ LogLevel: "info", AutoStopStaleAfter: 0, @@ -62,6 +74,7 @@ func Load(layout paths.Layout) (model.DaemonConfig, error) { TapPoolSize: 4, DefaultDNS: model.DefaultDNS, DefaultImageName: "debian-bookworm", + HostHomeDir: home, } var file fileConfig @@ -122,14 +135,14 @@ func Load(layout paths.Layout) (model.DaemonConfig, error) { cfg.LogLevel = value } - sshKeyPath, err := resolveSSHKeyPath(layout, file.SSHKeyPath) + sshKeyPath, err := resolveSSHKeyPath(layout, 
file.SSHKeyPath, home, ensureDefaultSSHKey) if err != nil { return cfg, err } cfg.SSHKeyPath = sshKeyPath for i, entry := range file.FileSync { - validated, err := validateFileSyncEntry(entry) + validated, err := validateFileSyncEntry(entry, home) if err != nil { return cfg, fmt.Errorf("file_sync[%d]: %w", i, err) } @@ -179,9 +192,9 @@ func parseVMDefaults(file vmDefaultsFile) (model.VMDefaultsOverride, error) { // validateFileSyncEntry normalises a single `[[file_sync]]` entry // and rejects anything the operator would regret later: empty -// paths, unsupported leading characters, path traversal, or -// non-absolute guest targets. -func validateFileSyncEntry(entry fileSyncEntryFile) (model.FileSyncEntry, error) { +// paths, unsupported leading characters, path traversal, host paths +// outside the owner home, or non-absolute guest targets. +func validateFileSyncEntry(entry fileSyncEntryFile, home string) (model.FileSyncEntry, error) { host := strings.TrimSpace(entry.Host) guest := strings.TrimSpace(entry.Guest) if host == "" { @@ -190,7 +203,7 @@ func validateFileSyncEntry(entry fileSyncEntryFile) (model.FileSyncEntry, error) if guest == "" { return model.FileSyncEntry{}, fmt.Errorf("guest path is required") } - if err := validateFileSyncPath("host", host, true); err != nil { + if _, err := ResolveFileSyncHostPath(host, home); err != nil { return model.FileSyncEntry{}, err } if err := validateFileSyncPath("guest", guest, true); err != nil { @@ -211,6 +224,57 @@ func validateFileSyncEntry(entry fileSyncEntryFile) (model.FileSyncEntry, error) return model.FileSyncEntry{Host: host, Guest: guest, Mode: mode}, nil } +// ResolveFileSyncHostPath expands a configured [[file_sync]].host path +// against the owner home and rejects anything that lands outside that +// home. Both config.Load and the root daemon use this so policy cannot +// drift between startup-time validation and runtime file reads. 
+func ResolveFileSyncHostPath(raw, home string) (string, error) { + raw = strings.TrimSpace(raw) + if err := validateFileSyncPath("host", raw, true); err != nil { + return "", err + } + home = strings.TrimSpace(home) + if home == "" { + return "", fmt.Errorf("host path %q: owner home is required", raw) + } + if !filepath.IsAbs(home) { + return "", fmt.Errorf("host path %q: owner home %q must be absolute", raw, home) + } + candidate := raw + if strings.HasPrefix(raw, "~/") { + candidate = filepath.Join(home, strings.TrimPrefix(raw, "~/")) + } + candidate = filepath.Clean(candidate) + if !filepath.IsAbs(candidate) { + return "", fmt.Errorf("host path %q: resolved path %q must be absolute", raw, candidate) + } + if err := ensurePathWithinRoot(candidate, home); err != nil { + return "", fmt.Errorf("host path %q: %w", raw, err) + } + return candidate, nil +} + +// ResolveExistingFileSyncHostPath resolves a configured +// [[file_sync]].host path to its real on-disk target. This is the +// runtime companion to ResolveFileSyncHostPath: once os.Stat succeeds, +// the daemon uses this to ensure a top-level symlink still points +// inside the owner home before it reads from the path as root. +func ResolveExistingFileSyncHostPath(raw, home string) (string, error) { + candidate, err := ResolveFileSyncHostPath(raw, home) + if err != nil { + return "", err + } + resolved, err := filepath.EvalSymlinks(candidate) + if err != nil { + return "", fmt.Errorf("host path %q: resolve symlinks: %w", raw, err) + } + resolved = filepath.Clean(resolved) + if err := ensurePathWithinRoot(resolved, home); err != nil { + return "", fmt.Errorf("host path %q: resolved symlink target %q: %w", raw, resolved, err) + } + return resolved, nil +} + // validateFileSyncPath rejects relative paths (other than a leading // "~/"), "..", empty segments, and "~user/..." forms banger doesn't // expand. 
Absolute paths and home-anchored paths pass through — the @@ -240,6 +304,19 @@ func validateFileSyncPath(label, raw string, allowHome bool) error { return nil } +func ensurePathWithinRoot(candidate, root string) error { + root = filepath.Clean(strings.TrimSpace(root)) + candidate = filepath.Clean(strings.TrimSpace(candidate)) + rel, err := filepath.Rel(root, candidate) + if err != nil { + return fmt.Errorf("compare against owner home %q: %w", root, err) + } + if rel == ".." || strings.HasPrefix(rel, ".."+string(os.PathSeparator)) { + return fmt.Errorf("must stay under owner home %q", root) + } + return nil +} + // validateFileSyncMode accepts three- or four-digit octal strings. // Three-digit modes like "600" are auto-prefixed with a leading 0 // when parsed by the consumer. @@ -255,10 +332,10 @@ func validateFileSyncMode(mode string) error { return nil } -func resolveSSHKeyPath(layout paths.Layout, configured string) (string, error) { +func resolveSSHKeyPath(layout paths.Layout, configured, home string, ensureDefault bool) (string, error) { configured = strings.TrimSpace(configured) if configured != "" { - return normalizeSSHKeyPath(configured) + return normalizeSSHKeyPath(configured, home) } // Key lives under the state dir, not the config dir. 
The daemon's // ensureVMSSHClientConfig scrubs ConfigDir/ssh on every Open as @@ -272,7 +349,11 @@ func resolveSSHKeyPath(layout paths.Layout, configured string) (string, error) { if !filepath.IsAbs(sshDir) { return "", fmt.Errorf("ssh key dir must be absolute; got %q (check paths.Resolve populated SSHDir / StateDir)", sshDir) } - return ensureDefaultSSHKey(filepath.Join(sshDir, "id_ed25519")) + defaultPath := filepath.Join(sshDir, "id_ed25519") + if ensureDefault { + return ensureDefaultSSHKey(defaultPath) + } + return defaultPath, nil } // normalizeSSHKeyPath validates and canonicalises a user-configured @@ -289,7 +370,7 @@ func resolveSSHKeyPath(layout paths.Layout, configured string) (string, error) { // ambiguous because the daemon's cwd isn't the user's shell cwd, // and readers in internal/guest + internal/cli do raw os.ReadFile // on the path without re-resolving against a known anchor -func normalizeSSHKeyPath(raw string) (string, error) { +func normalizeSSHKeyPath(raw, home string) (string, error) { raw = strings.TrimSpace(raw) if raw == "" { return "", nil @@ -301,9 +382,9 @@ func normalizeSSHKeyPath(raw string) (string, error) { return "", fmt.Errorf("ssh_key_path %q: only '~/' is expanded, not '~user/'", raw) } if strings.HasPrefix(raw, "~/") { - home, err := os.UserHomeDir() - if err != nil { - return "", fmt.Errorf("ssh_key_path %q: expand ~/: %w", raw, err) + home = strings.TrimSpace(home) + if home == "" { + return "", fmt.Errorf("ssh_key_path %q: no home directory available for ~ expansion", raw) } raw = filepath.Join(home, strings.TrimPrefix(raw, "~/")) } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index c95cc54..2a38fb6 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -70,6 +70,25 @@ func TestLoadSSHKeyPathExpandsHomeAnchored(t *testing.T) { } } +func TestLoadDaemonDoesNotGenerateDefaultSSHKey(t *testing.T) { + ownerHome := t.TempDir() + sshDir := filepath.Join(t.TempDir(), 
"daemon-ssh") + cfg, err := LoadDaemon(paths.Layout{ConfigDir: t.TempDir(), SSHDir: sshDir}, ownerHome) + if err != nil { + t.Fatalf("LoadDaemon: %v", err) + } + wantKey := filepath.Join(sshDir, "id_ed25519") + if cfg.SSHKeyPath != wantKey { + t.Fatalf("SSHKeyPath = %q, want %q", cfg.SSHKeyPath, wantKey) + } + if cfg.HostHomeDir != ownerHome { + t.Fatalf("HostHomeDir = %q, want %q", cfg.HostHomeDir, ownerHome) + } + if _, err := os.Stat(wantKey); !os.IsNotExist(err) { + t.Fatalf("LoadDaemon created %s, want no key material on daemon config load", wantKey) + } +} + // TestLoadNormalizesAbsoluteSSHKeyPath pins filepath.Clean behaviour // for configured paths: trailing slashes and duplicate slashes are // flattened so downstream path comparisons don't see two spellings @@ -245,15 +264,19 @@ func TestLoadAppliesLogLevelEnvOverride(t *testing.T) { } func TestLoadAcceptsFileSyncEntries(t *testing.T) { + homeDir := t.TempDir() + t.Setenv("HOME", homeDir) + configDir := t.TempDir() + hostsFile := filepath.Join(homeDir, ".config", "gh", "hosts.yml") data := []byte(` [[file_sync]] host = "~/.aws" guest = "~/.aws" [[file_sync]] -host = "/etc/resolv.conf" -guest = "/root/.config/resolv.conf" +host = "` + hostsFile + `" +guest = "/root/.config/gh/hosts.yml" mode = "0644" `) if err := os.WriteFile(filepath.Join(configDir, "config.toml"), data, 0o644); err != nil { @@ -269,11 +292,42 @@ mode = "0644" if cfg.FileSync[0].Host != "~/.aws" || cfg.FileSync[0].Guest != "~/.aws" { t.Fatalf("entry[0] = %+v", cfg.FileSync[0]) } + if cfg.FileSync[1].Host != hostsFile || cfg.FileSync[1].Guest != "/root/.config/gh/hosts.yml" { + t.Fatalf("entry[1] = %+v", cfg.FileSync[1]) + } if cfg.FileSync[1].Mode != "0644" { t.Fatalf("entry[1] mode = %q", cfg.FileSync[1].Mode) } } +func TestLoadDaemonAcceptsFileSyncPathUnderOwnerHome(t *testing.T) { + ownerHome := t.TempDir() + t.Setenv("HOME", t.TempDir()) + + configDir := t.TempDir() + allowed := filepath.Join(ownerHome, ".config", "gh", "hosts.yml") + 
data := []byte(` +[[file_sync]] +host = "` + allowed + `" +guest = "~/.config/gh/hosts.yml" +`) + if err := os.WriteFile(filepath.Join(configDir, "config.toml"), data, 0o644); err != nil { + t.Fatal(err) + } + + cfg, err := LoadDaemon(paths.Layout{ConfigDir: configDir, SSHDir: t.TempDir()}, ownerHome) + if err != nil { + t.Fatalf("LoadDaemon: %v", err) + } + got, err := ResolveFileSyncHostPath(cfg.FileSync[0].Host, cfg.HostHomeDir) + if err != nil { + t.Fatalf("ResolveFileSyncHostPath: %v", err) + } + if got != allowed { + t.Fatalf("resolved host path = %q, want %q", got, allowed) + } +} + func TestLoadRejectsInvalidFileSyncEntries(t *testing.T) { cases := []struct { name string @@ -333,6 +387,51 @@ func TestLoadRejectsInvalidFileSyncEntries(t *testing.T) { } } +func TestLoadRejectsFileSyncHostOutsideHome(t *testing.T) { + homeDir := t.TempDir() + t.Setenv("HOME", homeDir) + + configDir := t.TempDir() + data := []byte(` +[[file_sync]] +host = "/etc/resolv.conf" +guest = "~/resolv.conf" +`) + if err := os.WriteFile(filepath.Join(configDir, "config.toml"), data, 0o644); err != nil { + t.Fatal(err) + } + _, err := Load(paths.Layout{ConfigDir: configDir, SSHDir: t.TempDir()}) + if err == nil { + t.Fatal("Load: want error for host path outside home") + } + if !strings.Contains(err.Error(), "owner home") { + t.Fatalf("Load error = %v, want owner-home diagnostic", err) + } +} + +func TestLoadDaemonRejectsFileSyncHostOutsideOwnerHome(t *testing.T) { + ownerHome := t.TempDir() + t.Setenv("HOME", t.TempDir()) + + configDir := t.TempDir() + outside := filepath.Join(t.TempDir(), "secret.txt") + data := []byte(` +[[file_sync]] +host = "` + outside + `" +guest = "~/secret.txt" +`) + if err := os.WriteFile(filepath.Join(configDir, "config.toml"), data, 0o644); err != nil { + t.Fatal(err) + } + _, err := LoadDaemon(paths.Layout{ConfigDir: configDir, SSHDir: t.TempDir()}, ownerHome) + if err == nil { + t.Fatal("LoadDaemon: want error for host path outside owner home") + } + if 
!strings.Contains(err.Error(), "owner home") { + t.Fatalf("LoadDaemon error = %v, want owner-home diagnostic", err) + } +} + func TestLoadAcceptsVMDefaults(t *testing.T) { configDir := t.TempDir() data := []byte(` diff --git a/internal/daemon/ARCHITECTURE.md b/internal/daemon/ARCHITECTURE.md index 928c03b..623849c 100644 --- a/internal/daemon/ARCHITECTURE.md +++ b/internal/daemon/ARCHITECTURE.md @@ -2,16 +2,34 @@ This document describes the current daemon package layout: the `Daemon` composition root, the four services it wires together, the subpackages -that own stateless helpers, and the lock ordering every caller must +that own stateless helpers, the privileged-ops seam used by the +supported system install, and the lock ordering every caller must respect. +## Supported service topology + +On the supported host path (`banger system install` on a `systemd` +host), banger runs as two cooperating services: + +- `bangerd.service` runs as the configured owner user. It owns the + public RPC socket, store, image state, workspace prep, and the + lifecycle state machine. +- `bangerd-root.service` runs as root. It owns only the privileged + host-kernel operations: bridge/tap, NAT/resolver routing, dm/loop + snapshot plumbing, privileged ext4 mutation on dm devices, and + firecracker process/socket ownership. + +The owner daemon talks to the root helper through the `privilegedOps` +seam. Non-system/dev paths still use the same seam, but it is backed +by an in-process adapter instead of the helper RPC client. + ## Composition `Daemon` is a thin composition root. It holds shared infrastructure -(store, runner, logger, layout, config, listener) plus pointers to -four focused services. RPC dispatch is a pure forwarder into those -services; no lifecycle / image / workspace / networking behaviour -lives on `*Daemon` itself. +(store, runner, logger, layout, config, listener, privileged-ops +adapter) plus pointers to four focused services. 
RPC dispatch is a +pure forwarder into those services; no lifecycle / image / workspace / +networking behaviour lives on `*Daemon` itself. ``` Daemon @@ -62,6 +80,9 @@ idempotent and skips anything already set. - `tapPool` — TAP interface pool, owns its own lock. - `vmDNS *vmdns.Server` — in-process DNS server for `.vm` names. +- `privilegedOps` — the host-kernel seam used for bridge/tap/NAT, + resolver routing, dm snapshots, privileged ext4 mutation, and + firecracker ownership/kill flows. - No direct VM-state access. Where an operation needs a VM's tap name (e.g. `ensureNAT`), the signature takes `guestIP` + `tap` string so the caller (VMService) resolves them first. @@ -176,13 +197,17 @@ Notes: rehydrates the handle cache, reaps stale VMs, and republishes DNS records. `Daemon.backgroundLoop()` is the ticker fan-out — `VMService.pollStats`, `VMService.stopStaleVMs`, and -`VMService.pruneVMCreateOperations` run on independent tickers. +`VMService.pruneVMCreateOperations` run on independent tickers. On the +supported system path, any reconcile-time host cleanup that needs +privilege goes through `privilegedOps`, not directly through the owner +daemon process. ## External API Only `internal/cli` imports this package. The surface is: - `daemon.Open(ctx) (*Daemon, error)` +- `daemon.OpenSystem(ctx) (*Daemon, error)` - `(*Daemon).Serve(ctx) error` - `(*Daemon).Close() error` - `daemon.Doctor(...)` — host diagnostics (no receiver). diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go index a10cc4a..84325ed 100644 --- a/internal/daemon/daemon.go +++ b/internal/daemon/daemon.go @@ -14,8 +14,10 @@ import ( "banger/internal/config" ws "banger/internal/daemon/workspace" + "banger/internal/installmeta" "banger/internal/model" "banger/internal/paths" + "banger/internal/roothelper" "banger/internal/rpc" "banger/internal/store" "banger/internal/system" @@ -28,11 +30,13 @@ import ( // loop forwards RPCs to them. 
No lifecycle / image / workspace / // networking behavior lives on *Daemon itself — it's wiring. type Daemon struct { - layout paths.Layout - config model.DaemonConfig - store *store.Store - runner system.CommandRunner - logger *slog.Logger + layout paths.Layout + userLayout paths.Layout + config model.DaemonConfig + store *store.Store + runner system.CommandRunner + logger *slog.Logger + priv privilegedOps net *HostNetwork img *ImageService @@ -48,6 +52,8 @@ type Daemon struct { requestHandler func(context.Context, rpc.Request) rpc.Response guestWaitForSSH func(context.Context, string, string, time.Duration) error guestDial func(context.Context, string, string) (guestSSHClient, error) + clientUID int + clientGID int } func Open(ctx context.Context) (d *Daemon, err error) { @@ -62,6 +68,31 @@ func Open(ctx context.Context) (d *Daemon, err error) { if err != nil { return nil, err } + return openWithConfig(ctx, layout, layout, cfg, os.Getuid(), os.Getgid(), true, nil) +} + +func OpenSystem(ctx context.Context) (*Daemon, error) { + meta, err := installmeta.Load(installmeta.DefaultPath) + if err != nil { + return nil, err + } + layout := paths.ResolveSystem() + if err := paths.EnsureSystemOwned(layout); err != nil { + return nil, err + } + ownerLayout, err := paths.ResolveUserForHome(meta.OwnerHome) + if err != nil { + return nil, err + } + cfg, err := config.LoadDaemon(ownerLayout, meta.OwnerHome) + if err != nil { + return nil, err + } + helper := newHelperPrivilegedOps(roothelper.NewClient(installmeta.DefaultRootHelperSocketPath), cfg, layout) + return openWithConfig(ctx, layout, ownerLayout, cfg, -1, -1, false, helper) +} + +func openWithConfig(ctx context.Context, layout, userLayout paths.Layout, cfg model.DaemonConfig, clientUID, clientGID int, syncSSHConfig bool, priv privilegedOps) (d *Daemon, err error) { logger, normalizedLevel, err := newDaemonLogger(os.Stderr, cfg.LogLevel) if err != nil { return nil, err @@ -74,13 +105,17 @@ func Open(ctx context.Context) 
(d *Daemon, err error) { closing := make(chan struct{}) runner := system.NewRunner() d = &Daemon{ - layout: layout, - config: cfg, - store: db, - runner: runner, - logger: logger, - closing: closing, - pid: os.Getpid(), + layout: layout, + userLayout: userLayout, + config: cfg, + store: db, + runner: runner, + logger: logger, + closing: closing, + pid: os.Getpid(), + clientUID: clientUID, + clientGID: clientGID, + priv: priv, } wireServices(d) // From here on, every failure path must run Close() so the host @@ -95,7 +130,9 @@ func Open(ctx context.Context) (d *Daemon, err error) { } }() - d.ensureVMSSHClientConfig() + if syncSSHConfig { + d.ensureVMSSHClientConfig() + } d.logger.Info("daemon opened", "socket", layout.SocketPath, "state_dir", layout.StateDir, "log_level", cfg.LogLevel) if err = d.net.startVMDNS(vmdns.DefaultListenAddr); err != nil { d.logger.Error("daemon open failed", "stage", "start_vm_dns", "error", err.Error()) @@ -157,9 +194,28 @@ func (d *Daemon) Serve(ctx context.Context) error { d.listener = listener defer listener.Close() defer os.Remove(d.layout.SocketPath) + serveDone := make(chan struct{}) + defer close(serveDone) + go func() { + select { + case <-ctx.Done(): + _ = listener.Close() + case <-d.closing: + case <-serveDone: + } + }() + // Tighten the socket mode while root still owns it, then hand it to + // the configured client uid/gid. In the hardened systemd unit we keep + // CAP_CHOWN but intentionally do not keep the broader file-ownership + // capability set that would be needed to chmod after chown. 
if err := os.Chmod(d.layout.SocketPath, 0o600); err != nil { return err } + if d.clientUID >= 0 && d.clientGID >= 0 { + if err := os.Chown(d.layout.SocketPath, d.clientUID, d.clientGID); err != nil { + return err + } + } if d.logger != nil { d.logger.Info("daemon serving", "socket", d.layout.SocketPath, "pid", d.pid) } @@ -366,6 +422,13 @@ func (d *Daemon) TouchVM(ctx context.Context, idOrName string) (model.VMRecord, // the ws↔vm construction order doesn't recurse: the closures read d.vm // at call time, by which point it is populated. func wireServices(d *Daemon) { + if d.priv == nil { + clientUID, clientGID := d.clientUID, d.clientGID + if clientUID == 0 && clientGID == 0 { + clientUID, clientGID = -1, -1 + } + d.priv = newLocalPrivilegedOps(d.runner, d.logger, d.config, d.layout, clientUID, clientGID) + } if d.net == nil { d.net = newHostNetwork(hostNetworkDeps{ runner: d.runner, @@ -373,6 +436,7 @@ func wireServices(d *Daemon) { config: d.config, layout: d.layout, closing: d.closing, + priv: d.priv, }) } if d.img == nil { @@ -425,6 +489,7 @@ func wireServices(d *Daemon) { net: d.net, img: d.img, ws: d.ws, + priv: d.priv, capHooks: d.buildCapabilityHooks(), beginOperation: d.beginOperation, vsockHostDevice: defaultVsockHostDevice, diff --git a/internal/daemon/daemon_test.go b/internal/daemon/daemon_test.go index 686b69f..6cd4545 100644 --- a/internal/daemon/daemon_test.go +++ b/internal/daemon/daemon_test.go @@ -3,10 +3,16 @@ package daemon import ( "context" "encoding/json" + "errors" + "io" + "log/slog" + "net" "os" "path/filepath" "strings" + "syscall" "testing" + "time" "banger/internal/api" "banger/internal/buildinfo" @@ -56,6 +62,75 @@ func TestDispatchPingIncludesBuildInfo(t *testing.T) { } } +func TestServeReturnsOnContextCancel(t *testing.T) { + dir := t.TempDir() + runtimeDir := filepath.Join(dir, "runtime") + if err := os.MkdirAll(runtimeDir, 0o755); err != nil { + t.Fatalf("MkdirAll runtime: %v", err) + } + socketPath := filepath.Join(runtimeDir, 
"bangerd.sock") + probe, err := net.Listen("unix", filepath.Join(runtimeDir, "probe.sock")) + if err != nil { + if errors.Is(err, syscall.EPERM) || strings.Contains(err.Error(), "operation not permitted") { + t.Skipf("unix socket listen blocked in this environment: %v", err) + } + t.Fatalf("probe listen: %v", err) + } + _ = probe.Close() + _ = os.Remove(filepath.Join(runtimeDir, "probe.sock")) + d := &Daemon{ + layout: paths.Layout{ + RuntimeDir: runtimeDir, + SocketPath: socketPath, + }, + config: model.DaemonConfig{ + StatsPollInterval: time.Hour, + }, + store: openDaemonStore(t), + runner: system.NewRunner(), + logger: slog.New(slog.NewTextHandler(io.Discard, nil)), + closing: make(chan struct{}), + clientUID: -1, + clientGID: -1, + } + wireServices(d) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + serveErr := make(chan error, 1) + go func() { + serveErr <- d.Serve(ctx) + }() + + deadline := time.Now().Add(2 * time.Second) + for { + if _, err := os.Stat(socketPath); err == nil { + break + } + select { + case err := <-serveErr: + t.Fatalf("Serve() returned before socket was ready: %v", err) + default: + } + if time.Now().After(deadline) { + t.Fatalf("socket %s not created before deadline", socketPath) + } + time.Sleep(25 * time.Millisecond) + } + + cancel() + + select { + case err := <-serveErr: + if err != nil { + t.Fatalf("Serve() error = %v, want nil on context cancel", err) + } + case <-time.After(2 * time.Second): + t.Fatal("Serve() did not return after context cancel") + } +} + func TestPromoteImageCopiesBootArtifactsIntoArtifactDir(t *testing.T) { dir := t.TempDir() rootfs := filepath.Join(dir, "rootfs.ext4") diff --git a/internal/daemon/dns_routing.go b/internal/daemon/dns_routing.go index 92d2c0a..0167c5a 100644 --- a/internal/daemon/dns_routing.go +++ b/internal/daemon/dns_routing.go @@ -24,14 +24,7 @@ func (n *HostNetwork) syncVMDNSResolverRouting(ctx context.Context) error { if serverAddr == "" { return nil } - if _, 
err := n.runner.RunSudo(ctx, "resolvectl", "dns", n.config.BridgeName, serverAddr); err != nil { - return err - } - if _, err := n.runner.RunSudo(ctx, "resolvectl", "domain", n.config.BridgeName, vmResolverRouteDomain); err != nil { - return err - } - _, err := n.runner.RunSudo(ctx, "resolvectl", "default-route", n.config.BridgeName, "no") - return err + return n.privOps().SyncResolverRouting(ctx, serverAddr) } func (n *HostNetwork) clearVMDNSResolverRouting(ctx context.Context) error { @@ -44,8 +37,7 @@ func (n *HostNetwork) clearVMDNSResolverRouting(ctx context.Context) error { if _, err := n.runner.Run(ctx, "ip", "link", "show", n.config.BridgeName); err != nil { return nil } - _, err := n.runner.RunSudo(ctx, "resolvectl", "revert", n.config.BridgeName) - return err + return n.privOps().ClearResolverRouting(ctx) } func (n *HostNetwork) ensureVMDNSResolverRouting(ctx context.Context) { diff --git a/internal/daemon/doc.go b/internal/daemon/doc.go index 151f906..d20dbf1 100644 --- a/internal/daemon/doc.go +++ b/internal/daemon/doc.go @@ -1,9 +1,16 @@ -// Package daemon hosts the Banger daemon process. +// Package daemon hosts the Banger owner-daemon process. // // The daemon exposes a JSON-RPC endpoint over a Unix socket. The // *Daemon type is a thin composition root: it holds shared -// infrastructure (store, runner, logger, layout, config, listener) -// plus pointers to four focused services and forwards RPCs to them. +// infrastructure (store, runner, logger, layout, config, listener, +// privileged-ops adapter) plus pointers to four focused services and +// forwards RPCs to them. +// +// On the supported systemd install path, this package runs inside +// `bangerd.service` as the configured owner user and delegates +// privileged host-kernel operations to `bangerd-root.service` through +// the privileged-ops seam. Non-system/dev paths use the same seam with +// an in-process adapter instead. 
// // Services: // diff --git a/internal/daemon/doctor.go b/internal/daemon/doctor.go index bb0e57d..d322c44 100644 --- a/internal/daemon/doctor.go +++ b/internal/daemon/doctor.go @@ -16,14 +16,15 @@ import ( ) func Doctor(ctx context.Context) (system.Report, error) { - layout, err := paths.Resolve() + userLayout, err := paths.Resolve() if err != nil { return system.Report{}, err } - cfg, err := config.Load(layout) + cfg, err := config.Load(userLayout) if err != nil { return system.Report{}, err } + layout := paths.ResolveSystem() // Doctor must be read-only: running it should never mutate the // state DB (no migrations, no WAL checkpoint, no pragma writes). // Skip OpenReadOnly entirely when the DB file doesn't exist — @@ -32,9 +33,10 @@ func Doctor(ctx context.Context) (system.Report, error) { // "no DB yet" (pass) from "DB present but unreadable" (fail) in // the report. d := &Daemon{ - layout: layout, - config: cfg, - runner: system.NewRunner(), + layout: layout, + userLayout: userLayout, + config: cfg, + runner: system.NewRunner(), } var storeErr error storeMissing := false @@ -90,7 +92,7 @@ func (d *Daemon) doctorReport(ctx context.Context, storeErr error, storeMissing // This is intentionally a warn, not a fail — the shortcut is opt-in // convenience and `banger vm ssh` works either way. func (d *Daemon) addSSHShortcutCheck(report *system.Report) { - bangerConfig := BangerSSHConfigPath(d.layout) + bangerConfig := BangerSSHConfigPath(d.userLayout) if strings.TrimSpace(bangerConfig) == "" { return } diff --git a/internal/daemon/fcproc/fcproc.go b/internal/daemon/fcproc/fcproc.go index 4b4149f..eda6b27 100644 --- a/internal/daemon/fcproc/fcproc.go +++ b/internal/daemon/fcproc/fcproc.go @@ -73,19 +73,29 @@ func (m *Manager) EnsureBridge(ctx context.Context) error { // vsock sockets all live inside, so it must be readable only by the // invoking user. 
func (m *Manager) EnsureSocketDir() error { - if err := os.MkdirAll(m.cfg.RuntimeDir, 0o700); err != nil { + mode := os.FileMode(0o700) + if os.Geteuid() == 0 { + mode = 0o711 + } + if err := os.MkdirAll(m.cfg.RuntimeDir, mode); err != nil { return err } - return os.Chmod(m.cfg.RuntimeDir, 0o700) + return os.Chmod(m.cfg.RuntimeDir, mode) } // CreateTap (re)creates a TAP owned by the current uid/gid, attaches it to // the bridge, and brings both up. func (m *Manager) CreateTap(ctx context.Context, tap string) error { + return m.CreateTapOwned(ctx, tap, os.Getuid(), os.Getgid()) +} + +// CreateTapOwned (re)creates a TAP owned by uid:gid, attaches it to the +// bridge, and brings both up. +func (m *Manager) CreateTapOwned(ctx context.Context, tap string, uid, gid int) error { if _, err := m.runner.Run(ctx, "ip", "link", "show", tap); err == nil { _, _ = m.runner.RunSudo(ctx, "ip", "link", "del", tap) } - if _, err := m.runner.RunSudo(ctx, "ip", "tuntap", "add", "dev", tap, "mode", "tap", "user", strconv.Itoa(os.Getuid()), "group", strconv.Itoa(os.Getgid())); err != nil { + if _, err := m.runner.RunSudo(ctx, "ip", "tuntap", "add", "dev", tap, "mode", "tap", "user", strconv.Itoa(uid), "group", strconv.Itoa(gid)); err != nil { return err } if _, err := m.runner.RunSudo(ctx, "ip", "link", "set", tap, "master", m.cfg.BridgeName); err != nil { @@ -121,13 +131,26 @@ func (m *Manager) ResolveBinary() (string, error) { // EnsureSocketAccess waits for the socket to appear then chowns/chmods it to // the current uid/gid, mode 0600. func (m *Manager) EnsureSocketAccess(ctx context.Context, socketPath, label string) error { + return m.EnsureSocketAccessFor(ctx, socketPath, label, os.Getuid(), os.Getgid()) +} + +// EnsureSocketAccessFor waits for the socket to appear then chowns/chmods it +// to uid:gid, mode 0600. 
+func (m *Manager) EnsureSocketAccessFor(ctx context.Context, socketPath, label string, uid, gid int) error { if err := waitForPath(ctx, socketPath, 5*time.Second, label); err != nil { return err } - if _, err := m.runner.RunSudo(ctx, "chown", fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid()), socketPath); err != nil { + if os.Geteuid() == 0 { + if _, err := m.runner.Run(ctx, "chmod", "600", socketPath); err != nil { + return err + } + _, err := m.runner.Run(ctx, "chown", fmt.Sprintf("%d:%d", uid, gid), socketPath) return err } - _, err := m.runner.RunSudo(ctx, "chmod", "600", socketPath) + if _, err := m.runner.RunSudo(ctx, "chmod", "600", socketPath); err != nil { + return err + } + _, err := m.runner.RunSudo(ctx, "chown", fmt.Sprintf("%d:%d", uid, gid), socketPath) return err } diff --git a/internal/daemon/fcproc/fcproc_test.go b/internal/daemon/fcproc/fcproc_test.go index 34464e8..57b3573 100644 --- a/internal/daemon/fcproc/fcproc_test.go +++ b/internal/daemon/fcproc/fcproc_test.go @@ -107,37 +107,10 @@ func TestWaitForPathRespectsContextCancellation(t *testing.T) { } } -// TestEnsureSocketAccessChownFailureBubbles verifies a sudo chown -// error surfaces untouched. The daemon's cleanup path relies on -// this — if chown fails, the socket is still root-owned and can't -// be used by the invoking user, so we absolutely must not pretend -// success. -func TestEnsureSocketAccessChownFailureBubbles(t *testing.T) { - socketPath := filepath.Join(t.TempDir(), "present.sock") - if err := os.WriteFile(socketPath, []byte{}, 0o600); err != nil { - t.Fatalf("WriteFile: %v", err) - } - - chownErr := errors.New("sudo chown failed") - runner := &scriptedRunner{ - t: t, - sudos: []scriptedCall{{err: chownErr}}, - } - mgr := New(runner, Config{}, slog.Default()) - - err := mgr.EnsureSocketAccess(context.Background(), socketPath, "api socket") - if !errors.Is(err, chownErr) { - t.Fatalf("err = %v, want chown error", err) - } - // chmod must not have been attempted. 
- if len(runner.sudos) != 0 { - t.Fatalf("chmod was attempted after chown failed: %d sudo calls left", len(runner.sudos)) - } -} - // TestEnsureSocketAccessChmodFailureBubbles verifies the chmod step -// (the belt-and-braces tighten to 0600 after chown) also surfaces -// errors cleanly. +// fails fast before any ownership handoff. Once chown runs, the +// bounded helper no longer owns the socket and can't tighten its mode +// without CAP_FOWNER, so the order matters. func TestEnsureSocketAccessChmodFailureBubbles(t *testing.T) { socketPath := filepath.Join(t.TempDir(), "present.sock") if err := os.WriteFile(socketPath, []byte{}, 0o600); err != nil { @@ -146,11 +119,8 @@ func TestEnsureSocketAccessChmodFailureBubbles(t *testing.T) { chmodErr := errors.New("sudo chmod failed") runner := &scriptedRunner{ - t: t, - sudos: []scriptedCall{ - {}, // chown succeeds - {err: chmodErr}, // chmod fails - }, + t: t, + sudos: []scriptedCall{{err: chmodErr}}, } mgr := New(runner, Config{}, slog.Default()) @@ -158,6 +128,34 @@ func TestEnsureSocketAccessChmodFailureBubbles(t *testing.T) { if !errors.Is(err, chmodErr) { t.Fatalf("err = %v, want chmod error", err) } + // chown must not have been attempted. + if len(runner.sudos) != 0 { + t.Fatalf("chown was attempted after chmod failed: %d sudo calls left", len(runner.sudos)) + } +} + +// TestEnsureSocketAccessChownFailureBubbles verifies the ownership +// handoff still surfaces errors after chmod succeeds. 
+func TestEnsureSocketAccessChownFailureBubbles(t *testing.T) { + socketPath := filepath.Join(t.TempDir(), "present.sock") + if err := os.WriteFile(socketPath, []byte{}, 0o600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + + chownErr := errors.New("sudo chown failed") + runner := &scriptedRunner{ + t: t, + sudos: []scriptedCall{ + {}, // chmod succeeds + {err: chownErr}, // chown fails + }, + } + mgr := New(runner, Config{}, slog.Default()) + + err := mgr.EnsureSocketAccess(context.Background(), socketPath, "api socket") + if !errors.Is(err, chownErr) { + t.Fatalf("err = %v, want chown error", err) + } } // TestEnsureSocketAccessTimesOutBeforeTouchingRunner pins the diff --git a/internal/daemon/host_network.go b/internal/daemon/host_network.go index 8f04a5b..9d1aa26 100644 --- a/internal/daemon/host_network.go +++ b/internal/daemon/host_network.go @@ -38,6 +38,7 @@ type HostNetwork struct { config model.DaemonConfig layout paths.Layout closing chan struct{} + priv privilegedOps tapPool tapPool vmDNS *vmdns.Server @@ -58,6 +59,7 @@ type hostNetworkDeps struct { config model.DaemonConfig layout paths.Layout closing chan struct{} + priv privilegedOps } func newHostNetwork(deps hostNetworkDeps) *HostNetwork { @@ -67,6 +69,7 @@ func newHostNetwork(deps hostNetworkDeps) *HostNetwork { config: deps.config, layout: deps.layout, closing: deps.closing, + priv: deps.priv, lookupExecutable: system.LookupExecutable, vmDNSAddr: func(server *vmdns.Server) string { return server.Addr() }, } @@ -140,7 +143,7 @@ func (n *HostNetwork) fc() *fcproc.Manager { } func (n *HostNetwork) ensureBridge(ctx context.Context) error { - return n.fc().EnsureBridge(ctx) + return n.privOps().EnsureBridge(ctx) } func (n *HostNetwork) ensureSocketDir() error { @@ -148,19 +151,19 @@ func (n *HostNetwork) ensureSocketDir() error { } func (n *HostNetwork) createTap(ctx context.Context, tap string) error { - return n.fc().CreateTap(ctx, tap) + return n.privOps().CreateTap(ctx, tap) } -func (n 
*HostNetwork) firecrackerBinary() (string, error) { - return n.fc().ResolveBinary() +func (n *HostNetwork) firecrackerBinary(ctx context.Context) (string, error) { + return n.privOps().ResolveFirecrackerBinary(ctx, n.config.FirecrackerBin) } func (n *HostNetwork) ensureSocketAccess(ctx context.Context, socketPath, label string) error { - return n.fc().EnsureSocketAccess(ctx, socketPath, label) + return n.privOps().EnsureSocketAccess(ctx, socketPath, label) } func (n *HostNetwork) findFirecrackerPID(ctx context.Context, apiSock string) (int, error) { - return n.fc().FindPID(ctx, apiSock) + return n.privOps().FindFirecrackerPID(ctx, apiSock) } func (n *HostNetwork) resolveFirecrackerPID(ctx context.Context, machine *firecracker.Machine, apiSock string) int { @@ -168,15 +171,35 @@ func (n *HostNetwork) resolveFirecrackerPID(ctx context.Context, machine *firecr } func (n *HostNetwork) sendCtrlAltDel(ctx context.Context, apiSockPath string) error { - return n.fc().SendCtrlAltDel(ctx, apiSockPath) + if err := n.ensureSocketAccess(ctx, apiSockPath, "firecracker api socket"); err != nil { + return err + } + return firecracker.New(apiSockPath, n.logger).SendCtrlAltDel(ctx) } func (n *HostNetwork) waitForExit(ctx context.Context, pid int, apiSock string, timeout time.Duration) error { - return n.fc().WaitForExit(ctx, pid, apiSock, timeout) + deadline := time.Now().Add(timeout) + for { + running, err := n.privOps().ProcessRunning(ctx, pid, apiSock) + if err != nil { + return err + } + if !running { + return nil + } + if time.Now().After(deadline) { + return errWaitForExitTimeout + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(100 * time.Millisecond): + } + } } func (n *HostNetwork) killVMProcess(ctx context.Context, pid int) error { - return n.fc().Kill(ctx, pid) + return n.privOps().KillProcess(ctx, pid) } // waitForGuestVSockAgent is a HostNetwork helper because it's diff --git a/internal/daemon/nat.go b/internal/daemon/nat.go index 
a879f54..2b3a7f0 100644 --- a/internal/daemon/nat.go +++ b/internal/daemon/nat.go @@ -15,7 +15,7 @@ type natRule = hostnat.Rule // Callers (vm_lifecycle) resolve the tap device from the handle cache // themselves and pass it in. func (n *HostNetwork) ensureNAT(ctx context.Context, guestIP, tap string, enable bool) error { - return hostnat.Ensure(ctx, n.runner, guestIP, tap, enable) + return n.privOps().EnsureNAT(ctx, guestIP, tap, enable) } func (n *HostNetwork) validateNATPrereqs(ctx context.Context) (string, error) { diff --git a/internal/daemon/open_close_test.go b/internal/daemon/open_close_test.go index 2d670c2..feaee22 100644 --- a/internal/daemon/open_close_test.go +++ b/internal/daemon/open_close_test.go @@ -45,6 +45,7 @@ func TestCloseOnPartiallyInitialisedDaemon(t *testing.T) { build: func(t *testing.T) *Daemon { server, err := vmdns.New("127.0.0.1:0", nil) if err != nil { + skipIfSocketRestricted(t, err) t.Fatalf("vmdns.New: %v", err) } return &Daemon{ diff --git a/internal/daemon/preflight.go b/internal/daemon/preflight.go index ff5d04e..b058815 100644 --- a/internal/daemon/preflight.go +++ b/internal/daemon/preflight.go @@ -46,7 +46,7 @@ func (s *VMService) addBaseStartPrereqs(checks *system.Preflight, image model.Im } func (s *VMService) addBaseStartCommandPrereqs(checks *system.Preflight) { - for _, command := range []string{"sudo", "ip", "dmsetup", "losetup", "blockdev", "truncate", "pgrep", "chown", "chmod", "kill", "e2cp", "e2rm", "debugfs"} { + for _, command := range []string{"ip", "dmsetup", "losetup", "blockdev", "truncate", "pgrep", "chown", "chmod", "kill", "e2cp", "e2rm", "debugfs"} { checks.RequireCommand(command, toolHint(command)) } } @@ -69,8 +69,6 @@ func toolHint(command string) string { return "install e2fsprogs" case "e2cp", "e2rm": return "install e2tools" - case "sudo": - return "install sudo" default: return "" } diff --git a/internal/daemon/privileged_ops.go b/internal/daemon/privileged_ops.go new file mode 100644 index 
0000000..5e2f8b1 --- /dev/null +++ b/internal/daemon/privileged_ops.go @@ -0,0 +1,354 @@ +package daemon + +import ( + "context" + "errors" + "log/slog" + "os" + "strconv" + "strings" + "syscall" + + "banger/internal/daemon/dmsnap" + "banger/internal/daemon/fcproc" + "banger/internal/firecracker" + "banger/internal/hostnat" + "banger/internal/model" + "banger/internal/paths" + "banger/internal/roothelper" + "banger/internal/system" +) + +type privilegedOps interface { + EnsureBridge(context.Context) error + CreateTap(context.Context, string) error + DeleteTap(context.Context, string) error + SyncResolverRouting(context.Context, string) error + ClearResolverRouting(context.Context) error + EnsureNAT(context.Context, string, string, bool) error + CreateDMSnapshot(context.Context, string, string, string) (dmSnapshotHandles, error) + CleanupDMSnapshot(context.Context, dmSnapshotHandles) error + RemoveDMSnapshot(context.Context, string) error + FsckSnapshot(context.Context, string) error + ReadExt4File(context.Context, string, string) ([]byte, error) + WriteExt4Files(context.Context, string, []roothelper.Ext4Write) error + ResolveFirecrackerBinary(context.Context, string) (string, error) + LaunchFirecracker(context.Context, roothelper.FirecrackerLaunchRequest) (int, error) + EnsureSocketAccess(context.Context, string, string) error + FindFirecrackerPID(context.Context, string) (int, error) + KillProcess(context.Context, int) error + SignalProcess(context.Context, int, string) error + ProcessRunning(context.Context, int, string) (bool, error) +} + +type localPrivilegedOps struct { + runner system.CommandRunner + logger *slog.Logger + config model.DaemonConfig + layout paths.Layout + clientUID int + clientGID int +} + +func (n *HostNetwork) privOps() privilegedOps { + if n.priv == nil { + n.priv = newLocalPrivilegedOps(n.runner, n.logger, n.config, n.layout, os.Getuid(), os.Getgid()) + } + return n.priv +} + +func (s *VMService) privOps() privilegedOps { + if s.priv == 
nil { + s.priv = newLocalPrivilegedOps(s.runner, s.logger, s.config, s.layout, os.Getuid(), os.Getgid()) + } + return s.priv +} + +func newLocalPrivilegedOps(runner system.CommandRunner, logger *slog.Logger, cfg model.DaemonConfig, layout paths.Layout, clientUID, clientGID int) privilegedOps { + if clientUID < 0 { + clientUID = os.Getuid() + } + if clientGID < 0 { + clientGID = os.Getgid() + } + return &localPrivilegedOps{ + runner: runner, + logger: logger, + config: cfg, + layout: layout, + clientUID: clientUID, + clientGID: clientGID, + } +} + +func (o *localPrivilegedOps) EnsureBridge(ctx context.Context) error { + return o.fc().EnsureBridge(ctx) +} + +func (o *localPrivilegedOps) CreateTap(ctx context.Context, tapName string) error { + return o.fc().CreateTapOwned(ctx, tapName, o.clientUID, o.clientGID) +} + +func (o *localPrivilegedOps) DeleteTap(ctx context.Context, tapName string) error { + _, err := o.runner.RunSudo(ctx, "ip", "link", "del", tapName) + return err +} + +func (o *localPrivilegedOps) SyncResolverRouting(ctx context.Context, serverAddr string) error { + if strings.TrimSpace(o.config.BridgeName) == "" || strings.TrimSpace(serverAddr) == "" { + return nil + } + if _, err := system.LookupExecutable("resolvectl"); err != nil { + return nil + } + if _, err := o.runner.RunSudo(ctx, "resolvectl", "dns", o.config.BridgeName, serverAddr); err != nil { + return err + } + if _, err := o.runner.RunSudo(ctx, "resolvectl", "domain", o.config.BridgeName, vmResolverRouteDomain); err != nil { + return err + } + _, err := o.runner.RunSudo(ctx, "resolvectl", "default-route", o.config.BridgeName, "no") + return err +} + +func (o *localPrivilegedOps) ClearResolverRouting(ctx context.Context) error { + if strings.TrimSpace(o.config.BridgeName) == "" { + return nil + } + if _, err := system.LookupExecutable("resolvectl"); err != nil { + return nil + } + _, err := o.runner.RunSudo(ctx, "resolvectl", "revert", o.config.BridgeName) + return err +} + +func (o 
*localPrivilegedOps) EnsureNAT(ctx context.Context, guestIP, tap string, enable bool) error { + return hostnat.Ensure(ctx, o.runner, guestIP, tap, enable) +} + +func (o *localPrivilegedOps) CreateDMSnapshot(ctx context.Context, rootfsPath, cowPath, dmName string) (dmSnapshotHandles, error) { + return dmsnap.Create(ctx, o.runner, rootfsPath, cowPath, dmName) +} + +func (o *localPrivilegedOps) CleanupDMSnapshot(ctx context.Context, handles dmSnapshotHandles) error { + return dmsnap.Cleanup(ctx, o.runner, handles) +} + +func (o *localPrivilegedOps) RemoveDMSnapshot(ctx context.Context, target string) error { + return dmsnap.Remove(ctx, o.runner, target) +} + +func (o *localPrivilegedOps) FsckSnapshot(ctx context.Context, dmDev string) error { + if _, err := o.runner.RunSudo(ctx, "e2fsck", "-fy", dmDev); err != nil { + if code := system.ExitCode(err); code < 0 || code > 1 { + return err + } + } + return nil +} + +func (o *localPrivilegedOps) ReadExt4File(ctx context.Context, imagePath, guestPath string) ([]byte, error) { + return system.ReadExt4File(ctx, o.runner, imagePath, guestPath) +} + +func (o *localPrivilegedOps) WriteExt4Files(ctx context.Context, imagePath string, files []roothelper.Ext4Write) error { + for _, file := range files { + mode := os.FileMode(file.Mode) + if mode == 0 { + mode = 0o644 + } + if err := system.WriteExt4FileOwned(ctx, o.runner, imagePath, file.GuestPath, mode, 0, 0, file.Data); err != nil { + return err + } + } + return nil +} + +func (o *localPrivilegedOps) ResolveFirecrackerBinary(_ context.Context, requested string) (string, error) { + manager := fcproc.New(o.runner, fcproc.Config{FirecrackerBin: normalizeFirecrackerBinary(requested, o.config.FirecrackerBin)}, o.logger) + return manager.ResolveBinary() +} + +func (o *localPrivilegedOps) LaunchFirecracker(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (int, error) { + machine, err := firecracker.NewMachine(ctx, firecracker.MachineConfig{ + BinaryPath: req.BinaryPath, + 
VMID: req.VMID, + SocketPath: req.SocketPath, + LogPath: req.LogPath, + MetricsPath: req.MetricsPath, + KernelImagePath: req.KernelImagePath, + InitrdPath: req.InitrdPath, + KernelArgs: req.KernelArgs, + Drives: req.Drives, + TapDevice: req.TapDevice, + VSockPath: req.VSockPath, + VSockCID: req.VSockCID, + VCPUCount: req.VCPUCount, + MemoryMiB: req.MemoryMiB, + Logger: o.logger, + }) + if err != nil { + return 0, err + } + if err := machine.Start(ctx); err != nil { + if pid := o.fc().ResolvePID(context.Background(), machine, req.SocketPath); pid > 0 { + _ = o.KillProcess(context.Background(), pid) + } + return 0, err + } + if err := o.EnsureSocketAccess(ctx, req.SocketPath, "firecracker api socket"); err != nil { + return 0, err + } + if strings.TrimSpace(req.VSockPath) != "" { + if err := o.EnsureSocketAccess(ctx, req.VSockPath, "firecracker vsock socket"); err != nil { + return 0, err + } + } + pid := o.fc().ResolvePID(context.Background(), machine, req.SocketPath) + if pid <= 0 { + return 0, errors.New("firecracker started but pid could not be resolved") + } + return pid, nil +} + +func (o *localPrivilegedOps) EnsureSocketAccess(ctx context.Context, socketPath, label string) error { + return o.fc().EnsureSocketAccessFor(ctx, socketPath, label, o.clientUID, o.clientGID) +} + +func (o *localPrivilegedOps) FindFirecrackerPID(ctx context.Context, apiSock string) (int, error) { + return o.fc().FindPID(ctx, apiSock) +} + +func (o *localPrivilegedOps) KillProcess(ctx context.Context, pid int) error { + return o.fc().Kill(ctx, pid) +} + +func (o *localPrivilegedOps) SignalProcess(ctx context.Context, pid int, signal string) error { + if strings.TrimSpace(signal) == "" { + signal = "TERM" + } + _, err := o.runner.RunSudo(ctx, "kill", "-"+signal, strconv.Itoa(pid)) + return err +} + +func (o *localPrivilegedOps) ProcessRunning(_ context.Context, pid int, apiSock string) (bool, error) { + return system.ProcessRunning(pid, apiSock), nil +} + +func (o *localPrivilegedOps) 
fc() *fcproc.Manager { + return fcproc.New(o.runner, fcproc.Config{ + FirecrackerBin: normalizeFirecrackerBinary("", o.config.FirecrackerBin), + BridgeName: o.config.BridgeName, + BridgeIP: o.config.BridgeIP, + CIDR: o.config.CIDR, + RuntimeDir: o.layout.RuntimeDir, + }, o.logger) +} + +type helperPrivilegedOps struct { + client *roothelper.Client + config model.DaemonConfig + layout paths.Layout +} + +func newHelperPrivilegedOps(client *roothelper.Client, cfg model.DaemonConfig, layout paths.Layout) privilegedOps { + return &helperPrivilegedOps{client: client, config: cfg, layout: layout} +} + +func (o *helperPrivilegedOps) EnsureBridge(ctx context.Context) error { + return o.client.EnsureBridge(ctx, o.networkConfig()) +} + +func (o *helperPrivilegedOps) CreateTap(ctx context.Context, tapName string) error { + return o.client.CreateTap(ctx, o.networkConfig(), tapName) +} + +func (o *helperPrivilegedOps) DeleteTap(ctx context.Context, tapName string) error { + return o.client.DeleteTap(ctx, tapName) +} + +func (o *helperPrivilegedOps) SyncResolverRouting(ctx context.Context, serverAddr string) error { + return o.client.SyncResolverRouting(ctx, o.config.BridgeName, serverAddr) +} + +func (o *helperPrivilegedOps) ClearResolverRouting(ctx context.Context) error { + return o.client.ClearResolverRouting(ctx, o.config.BridgeName) +} + +func (o *helperPrivilegedOps) EnsureNAT(ctx context.Context, guestIP, tap string, enable bool) error { + return o.client.EnsureNAT(ctx, guestIP, tap, enable) +} + +func (o *helperPrivilegedOps) CreateDMSnapshot(ctx context.Context, rootfsPath, cowPath, dmName string) (dmSnapshotHandles, error) { + return o.client.CreateDMSnapshot(ctx, rootfsPath, cowPath, dmName) +} + +func (o *helperPrivilegedOps) CleanupDMSnapshot(ctx context.Context, handles dmSnapshotHandles) error { + return o.client.CleanupDMSnapshot(ctx, handles) +} + +func (o *helperPrivilegedOps) RemoveDMSnapshot(ctx context.Context, target string) error { + return 
o.client.RemoveDMSnapshot(ctx, target) +} + +func (o *helperPrivilegedOps) FsckSnapshot(ctx context.Context, dmDev string) error { + return o.client.FsckSnapshot(ctx, dmDev) +} + +func (o *helperPrivilegedOps) ReadExt4File(ctx context.Context, imagePath, guestPath string) ([]byte, error) { + return o.client.ReadExt4File(ctx, imagePath, guestPath) +} + +func (o *helperPrivilegedOps) WriteExt4Files(ctx context.Context, imagePath string, files []roothelper.Ext4Write) error { + return o.client.WriteExt4Files(ctx, imagePath, files) +} + +func (o *helperPrivilegedOps) ResolveFirecrackerBinary(ctx context.Context, requested string) (string, error) { + return o.client.ResolveFirecrackerBinary(ctx, normalizeFirecrackerBinary(requested, o.config.FirecrackerBin)) +} + +func (o *helperPrivilegedOps) LaunchFirecracker(ctx context.Context, req roothelper.FirecrackerLaunchRequest) (int, error) { + req.Network = o.networkConfig() + return o.client.LaunchFirecracker(ctx, req) +} + +func (o *helperPrivilegedOps) EnsureSocketAccess(ctx context.Context, socketPath, label string) error { + if info, err := os.Stat(socketPath); err == nil { + if stat, ok := info.Sys().(*syscall.Stat_t); ok && int(stat.Uid) == os.Getuid() { + return os.Chmod(socketPath, 0o600) + } + } + return o.client.EnsureSocketAccess(ctx, socketPath, label) +} + +func (o *helperPrivilegedOps) FindFirecrackerPID(ctx context.Context, apiSock string) (int, error) { + return o.client.FindFirecrackerPID(ctx, apiSock) +} + +func (o *helperPrivilegedOps) KillProcess(ctx context.Context, pid int) error { + return o.client.KillProcess(ctx, pid) +} + +func (o *helperPrivilegedOps) SignalProcess(ctx context.Context, pid int, signal string) error { + return o.client.SignalProcess(ctx, pid, signal) +} + +func (o *helperPrivilegedOps) ProcessRunning(ctx context.Context, pid int, apiSock string) (bool, error) { + return o.client.ProcessRunning(ctx, pid, apiSock) +} + +func (o *helperPrivilegedOps) networkConfig() 
roothelper.NetworkConfig { + return roothelper.NetworkConfig{ + BridgeName: o.config.BridgeName, + BridgeIP: o.config.BridgeIP, + CIDR: o.config.CIDR, + } +} + +func normalizeFirecrackerBinary(requested, configured string) string { + requested = strings.TrimSpace(requested) + if requested != "" { + return requested + } + return strings.TrimSpace(configured) +} diff --git a/internal/daemon/snapshot.go b/internal/daemon/snapshot.go index 5835197..0515b31 100644 --- a/internal/daemon/snapshot.go +++ b/internal/daemon/snapshot.go @@ -11,13 +11,13 @@ import ( type dmSnapshotHandles = dmsnap.Handles func (n *HostNetwork) createDMSnapshot(ctx context.Context, rootfsPath, cowPath, dmName string) (dmSnapshotHandles, error) { - return dmsnap.Create(ctx, n.runner, rootfsPath, cowPath, dmName) + return n.privOps().CreateDMSnapshot(ctx, rootfsPath, cowPath, dmName) } func (n *HostNetwork) cleanupDMSnapshot(ctx context.Context, handles dmSnapshotHandles) error { - return dmsnap.Cleanup(ctx, n.runner, handles) + return n.privOps().CleanupDMSnapshot(ctx, handles) } func (n *HostNetwork) removeDMSnapshot(ctx context.Context, target string) error { - return dmsnap.Remove(ctx, n.runner, target) + return n.privOps().RemoveDMSnapshot(ctx, target) } diff --git a/internal/daemon/ssh_client_config.go b/internal/daemon/ssh_client_config.go index 455cb6a..069cc2d 100644 --- a/internal/daemon/ssh_client_config.go +++ b/internal/daemon/ssh_client_config.go @@ -57,7 +57,7 @@ func BangerSSHConfigPath(layout paths.Layout) string { } func (d *Daemon) ensureVMSSHClientConfig() { - if err := syncVMSSHClientConfig(d.layout, d.config.SSHKeyPath); err != nil && d.logger != nil { + if err := SyncVMSSHClientConfig(d.userLayout, d.config.SSHKeyPath); err != nil && d.logger != nil { d.logger.Warn("vm ssh client config sync failed", "error", err.Error()) } } @@ -68,7 +68,7 @@ func (d *Daemon) ensureVMSSHClientConfig() { // // The file lives in the banger config dir so users who manage their // SSH config 
declaratively can decide how (or whether) to pull it in. -func syncVMSSHClientConfig(layout paths.Layout, keyPath string) error { +func SyncVMSSHClientConfig(layout paths.Layout, keyPath string) error { keyPath = strings.TrimSpace(keyPath) if keyPath == "" { return nil diff --git a/internal/daemon/ssh_client_config_test.go b/internal/daemon/ssh_client_config_test.go index 907665d..6133217 100644 --- a/internal/daemon/ssh_client_config_test.go +++ b/internal/daemon/ssh_client_config_test.go @@ -22,8 +22,8 @@ func TestSyncVMSSHClientConfigWritesBangerFileOnly(t *testing.T) { } keyPath := filepath.Join(homeDir, ".config", "banger", "ssh", "id_ed25519") - if err := syncVMSSHClientConfig(layout, keyPath); err != nil { - t.Fatalf("syncVMSSHClientConfig: %v", err) + if err := SyncVMSSHClientConfig(layout, keyPath); err != nil { + t.Fatalf("SyncVMSSHClientConfig: %v", err) } // Banger's own ssh_config file has the `Host *.vm` stanza. diff --git a/internal/daemon/tap_pool.go b/internal/daemon/tap_pool.go index 88cf373..c0e5f60 100644 --- a/internal/daemon/tap_pool.go +++ b/internal/daemon/tap_pool.go @@ -106,7 +106,7 @@ func (n *HostNetwork) releaseTap(ctx context.Context, tapName string) error { } n.tapPool.mu.Unlock() } - _, err := n.runner.RunSudo(ctx, "ip", "link", "del", tapName) + err := n.privOps().DeleteTap(ctx, tapName) if err == nil { go n.ensureTapPool(context.Background()) } diff --git a/internal/daemon/vm_authsync.go b/internal/daemon/vm_authsync.go index b9a429e..b4feaaa 100644 --- a/internal/daemon/vm_authsync.go +++ b/internal/daemon/vm_authsync.go @@ -10,6 +10,7 @@ import ( "strconv" "strings" + "banger/internal/config" "banger/internal/guest" "banger/internal/model" "banger/internal/system" @@ -120,15 +121,22 @@ func (s *WorkspaceService) runFileSync(ctx context.Context, vm *model.VMRecord) runner = system.NewRunner() } - hostHome, err := os.UserHomeDir() - if err != nil { - return fmt.Errorf("resolve host user home: %w", err) + hostHome := 
strings.TrimSpace(s.config.HostHomeDir) + if hostHome == "" { + var err error + hostHome, err = os.UserHomeDir() + if err != nil { + return fmt.Errorf("resolve host user home: %w", err) + } } workDisk := vm.Runtime.WorkDiskPath for _, entry := range s.config.FileSync { - hostPath := expandHostPath(entry.Host, hostHome) + hostPath, err := config.ResolveFileSyncHostPath(entry.Host, hostHome) + if err != nil { + return fmt.Errorf("file_sync: %w", err) + } guestRel := guestPathRelativeToRoot(entry.Guest) guestImagePath := "/" + guestRel @@ -140,6 +148,10 @@ func (s *WorkspaceService) runFileSync(ctx context.Context, vm *model.VMRecord) } return fmt.Errorf("file_sync: stat %s: %w", hostPath, err) } + hostPath, err = config.ResolveExistingFileSyncHostPath(entry.Host, hostHome) + if err != nil { + return fmt.Errorf("file_sync: %w", err) + } vmCreateStage(ctx, "prepare_work_disk", "file sync: "+entry.Host+" → "+entry.Guest) @@ -180,8 +192,8 @@ func (s *WorkspaceService) runFileSync(ctx context.Context, vm *model.VMRecord) // inside ~/.aws that points at ~/secrets can't leak out of the tree // the user named. Other special types (devices, FIFOs) are skipped // silently. Top-level host paths go through os.Stat back in -// runFileSync and still follow, since the user explicitly named that -// path. +// runFileSync and may still follow, but only when the resolved target +// stays under the configured owner home. func (s *WorkspaceService) copyHostDir(ctx context.Context, vm model.VMRecord, runner system.CommandRunner, imagePath, hostDir, guestTarget string) error { if err := system.MkdirExt4(ctx, runner, imagePath, guestTarget, 0o755, 0, 0); err != nil { return err @@ -234,15 +246,6 @@ func parseFileSyncMode(raw string) (os.FileMode, error) { } // expandHostPath expands a leading "~/" against the host user's -// home. Already-absolute paths pass through unchanged. 
-func expandHostPath(raw, home string) string { - raw = strings.TrimSpace(raw) - if strings.HasPrefix(raw, "~/") { - return filepath.Join(home, strings.TrimPrefix(raw, "~/")) - } - return raw -} - // guestPathRelativeToRoot returns the guest path as a relative path // under /root (banger's work disk is mounted at /root in the guest, // so everything syncable lives there). "~/foo" and "/root/foo" both diff --git a/internal/daemon/vm_disk.go b/internal/daemon/vm_disk.go index a8b84be..f9e5166 100644 --- a/internal/daemon/vm_disk.go +++ b/internal/daemon/vm_disk.go @@ -10,6 +10,7 @@ import ( "banger/internal/guestconfig" "banger/internal/guestnet" "banger/internal/model" + "banger/internal/roothelper" "banger/internal/system" ) @@ -27,18 +28,19 @@ func (s *VMService) ensureSystemOverlay(ctx context.Context, vm *model.VMRecord) // patchRootOverlay writes the per-VM config files (resolv.conf, // hostname, hosts, sshd drop-in, network bootstrap, fstab) into the -// rootfs overlay. Reads the DM device path from the handle cache, -// which the start flow populates before calling this. -func (s *VMService) patchRootOverlay(ctx context.Context, vm model.VMRecord, image model.Image) error { - dmDev := s.vmHandles(vm.ID).DMDev - if dmDev == "" { - return fmt.Errorf("vm %q: DM device not in handle cache — start flow out of order?", vm.ID) +// rootfs overlay. The start flow passes the DM device path explicitly so the +// owner daemon can hand the privileged ext4 work to the root helper without +// rereading mutable process state. 
+func (s *VMService) patchRootOverlay(ctx context.Context, vm model.VMRecord, image model.Image, dmDev string) error { + if strings.TrimSpace(dmDev) == "" { + return fmt.Errorf("vm %q: DM device is required", vm.ID) } resolv := []byte(fmt.Sprintf("nameserver %s\n", s.config.DefaultDNS)) hostname := []byte(vm.Name + "\n") hosts := []byte(fmt.Sprintf("127.0.0.1 localhost\n127.0.1.1 %s\n", vm.Name)) sshdConfig := []byte(sshdGuestConfig()) - fstab, err := system.ReadDebugFSText(ctx, s.runner, dmDev, "/etc/fstab") + fstabBytes, err := s.privOps().ReadExt4File(ctx, dmDev, "/etc/fstab") + fstab := string(fstabBytes) if err != nil { fstab = "" } @@ -70,19 +72,19 @@ func (s *VMService) patchRootOverlay(ctx context.Context, vm model.VMRecord, ima s.capHooks.contributeGuest(builder, vm, image) builder.WriteFile("/etc/fstab", []byte(builder.RenderFSTab(fstab))) files := builder.Files() + writes := make([]roothelper.Ext4Write, 0, len(files)) for _, guestPath := range builder.FilePaths() { - data := files[guestPath] + mode := uint32(0o644) if guestPath == guestnet.GuestScriptPath { - if err := system.WriteExt4FileMode(ctx, s.runner, dmDev, guestPath, 0o755, data); err != nil { - return err - } - continue - } - if err := system.WriteExt4File(ctx, s.runner, dmDev, guestPath, data); err != nil { - return err + mode = 0o755 } + writes = append(writes, roothelper.Ext4Write{ + GuestPath: guestPath, + Data: files[guestPath], + Mode: mode, + }) } - return nil + return s.privOps().WriteExt4Files(ctx, dmDev, writes) } func (s *VMService) ensureWorkDisk(ctx context.Context, vm *model.VMRecord, image model.Image) (workDiskPreparation, error) { diff --git a/internal/daemon/vm_handles.go b/internal/daemon/vm_handles.go index febf467..2ba9790 100644 --- a/internal/daemon/vm_handles.go +++ b/internal/daemon/vm_handles.go @@ -10,7 +10,6 @@ import ( "sync" "banger/internal/model" - "banger/internal/system" ) // handleCache is the daemon's in-memory map of per-VM transient @@ -175,7 +174,8 @@ func 
(s *VMService) vmAlive(vm model.VMRecord) bool { if h.PID <= 0 { return false } - return system.ProcessRunning(h.PID, vm.Runtime.APISockPath) + running, err := s.privOps().ProcessRunning(context.Background(), h.PID, vm.Runtime.APISockPath) + return err == nil && running } // rediscoverHandles loads what the last daemon start knew about a VM @@ -207,8 +207,10 @@ func (s *VMService) rediscoverHandles(ctx context.Context, vm model.VMRecord) (m saved.PID = pid return saved, true, nil } - if saved.PID > 0 && system.ProcessRunning(saved.PID, apiSock) { - return saved, true, nil + if saved.PID > 0 { + if running, runErr := s.privOps().ProcessRunning(ctx, saved.PID, apiSock); runErr == nil && running { + return saved, true, nil + } } return saved, false, nil } diff --git a/internal/daemon/vm_lifecycle.go b/internal/daemon/vm_lifecycle.go index de43caf..cb4f3b0 100644 --- a/internal/daemon/vm_lifecycle.go +++ b/internal/daemon/vm_lifecycle.go @@ -5,7 +5,6 @@ import ( "errors" "os" "path/filepath" - "strconv" "strings" "time" @@ -184,7 +183,7 @@ func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, si } pid := s.vmHandles(vm.ID).PID op.stage("send_signal", "pid", pid, "signal", signal) - if _, err := s.runner.RunSudo(ctx, "kill", "-"+signal, strconv.Itoa(pid)); err != nil { + if err := s.privOps().SignalProcess(ctx, pid, signal); err != nil { return model.VMRecord{}, err } op.stage("wait_for_exit", "pid", pid) diff --git a/internal/daemon/vm_lifecycle_steps.go b/internal/daemon/vm_lifecycle_steps.go index 718e5ed..f932a15 100644 --- a/internal/daemon/vm_lifecycle_steps.go +++ b/internal/daemon/vm_lifecycle_steps.go @@ -10,6 +10,7 @@ import ( "banger/internal/firecracker" "banger/internal/imagepull" "banger/internal/model" + "banger/internal/roothelper" "banger/internal/system" ) @@ -40,7 +41,6 @@ type startContext struct { dmName string tapName string fcPath string - machine *firecracker.Machine // systemOverlayCreated records whether the 
system_overlay step // actually created the file (vs. the file existing from a crashed @@ -243,12 +243,7 @@ func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startS // snapshot. Exit codes 0 + 1 are both "ok" here. name: "fsck_snapshot", run: func(ctx context.Context, sc *startContext) error { - if _, err := s.runner.RunSudo(ctx, "e2fsck", "-fy", sc.live.DMDev); err != nil { - if code := system.ExitCode(err); code < 0 || code > 1 { - return fmt.Errorf("fsck snapshot: %w", err) - } - } - return nil + return s.privOps().FsckSnapshot(ctx, sc.live.DMDev) }, }, { @@ -256,7 +251,7 @@ func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startS createStage: "prepare_rootfs", createDetail: "writing guest configuration", run: func(ctx context.Context, sc *startContext) error { - return s.patchRootOverlay(ctx, *sc.vm, sc.image) + return s.patchRootOverlay(ctx, *sc.vm, sc.image, sc.live.DMDev) }, }, { @@ -307,8 +302,8 @@ func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startS }, { name: "firecracker_binary", - run: func(_ context.Context, sc *startContext) error { - fcPath, err := s.net.firecrackerBinary() + run: func(ctx context.Context, sc *startContext) error { + fcPath, err := s.net.firecrackerBinary(ctx) if err != nil { return err } @@ -323,7 +318,7 @@ func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startS createDetail: "starting firecracker", run: func(ctx context.Context, sc *startContext) error { kernelArgs := buildKernelArgs(*sc.vm, sc.image, s.config.BridgeIP, s.config.DefaultDNS) - machineConfig := firecracker.MachineConfig{ + launchReq := roothelper.FirecrackerLaunchRequest{ BinaryPath: sc.fcPath, VMID: sc.vm.ID, SocketPath: sc.apiSock, @@ -343,24 +338,15 @@ func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startS VSockCID: sc.vm.Runtime.VSockCID, VCPUCount: sc.vm.Spec.VCPUCount, MemoryMiB: sc.vm.Spec.MemoryMiB, - Logger: s.logger, } + 
machineConfig := firecracker.MachineConfig{Drives: launchReq.Drives} s.capHooks.contributeMachine(&machineConfig, *sc.vm, sc.image) - machine, err := firecracker.NewMachine(ctx, machineConfig) + launchReq.Drives = machineConfig.Drives + pid, err := s.privOps().LaunchFirecracker(ctx, launchReq) if err != nil { return err } - sc.machine = machine - if err := machine.Start(ctx); err != nil { - // machine.Start can fail AFTER the firecracker process - // is already spawned (HTTP config phase). Record the - // PID so the undo can kill it; use a fresh ctx since - // the request ctx may be cancelled by now. - sc.live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, sc.apiSock) - s.setVMHandles(sc.vm, *sc.live) - return err - } - sc.live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, sc.apiSock) + sc.live.PID = pid s.setVMHandles(sc.vm, *sc.live) op.debugStage("firecracker_started", "pid", sc.live.PID) return nil diff --git a/internal/daemon/vm_service.go b/internal/daemon/vm_service.go index fdd7d95..d8db6a4 100644 --- a/internal/daemon/vm_service.go +++ b/internal/daemon/vm_service.go @@ -58,9 +58,10 @@ type VMService struct { // Peer services. VMService orchestrates across all three during // start/stop/delete; pointer fields keep call sites direct without // promoting the peer API to package-level interfaces. - net *HostNetwork - img *ImageService - ws *WorkspaceService + net *HostNetwork + img *ImageService + ws *WorkspaceService + priv privilegedOps // vsockHostDevice is the path preflight + doctor expect to find for // the vhost-vsock device. 
Defaults to defaultVsockHostDevice; tests @@ -101,6 +102,7 @@ type vmServiceDeps struct { net *HostNetwork img *ImageService ws *WorkspaceService + priv privilegedOps capHooks capabilityHooks beginOperation func(name string, attrs ...any) *operationLog vsockHostDevice string @@ -120,6 +122,7 @@ func newVMService(deps vmServiceDeps) *VMService { net: deps.net, img: deps.img, ws: deps.ws, + priv: deps.priv, capHooks: deps.capHooks, beginOperation: deps.beginOperation, vsockHostDevice: vsockPath, diff --git a/internal/daemon/vm_test.go b/internal/daemon/vm_test.go index bbd793a..868e5b0 100644 --- a/internal/daemon/vm_test.go +++ b/internal/daemon/vm_test.go @@ -427,8 +427,8 @@ func TestHealthVMReturnsHealthyForRunningGuest(t *testing.T) { runner := &scriptedRunner{ t: t, steps: []runnerStep{ - sudoStep("", nil, "chown", fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid()), vsockSock), sudoStep("", nil, "chmod", "600", vsockSock), + sudoStep("", nil, "chown", fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid()), vsockSock), }, } d := &Daemon{store: db, runner: runner} @@ -491,8 +491,8 @@ func TestPingVMAliasReturnsAliveForHealthyVM(t *testing.T) { runner := &scriptedRunner{ t: t, steps: []runnerStep{ - sudoStep("", nil, "chown", fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid()), vsockSock), sudoStep("", nil, "chmod", "600", vsockSock), + sudoStep("", nil, "chown", fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid()), vsockSock), }, } d := &Daemon{store: db, runner: runner} @@ -691,8 +691,8 @@ func TestPortsVMReturnsEnrichedPortsAndWebSchemes(t *testing.T) { runner := &scriptedRunner{ t: t, steps: []runnerStep{ - sudoStep("", nil, "chown", fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid()), vsockSock), sudoStep("", nil, "chmod", "600", vsockSock), + sudoStep("", nil, "chown", fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid()), vsockSock), }, } d := &Daemon{store: db, runner: runner} @@ -1148,13 +1148,92 @@ func TestRunFileSyncCopiesDirectoryRecursively(t *testing.T) { } } +func 
TestRunFileSyncAllowsTopLevelSymlinkWithinHome(t *testing.T) { + homeDir := t.TempDir() + t.Setenv("HOME", homeDir) + + targetDir := filepath.Join(homeDir, ".config", "gh") + if err := os.MkdirAll(targetDir, 0o755); err != nil { + t.Fatal(err) + } + targetPath := filepath.Join(targetDir, "hosts.yml") + if err := os.WriteFile(targetPath, []byte("github.com"), 0o600); err != nil { + t.Fatal(err) + } + linkPath := filepath.Join(homeDir, "gh-hosts.yml") + if err := os.Symlink(targetPath, linkPath); err != nil { + t.Skipf("symlink unsupported on this filesystem: %v", err) + } + + workDisk := t.TempDir() + d := &Daemon{ + runner: &filesystemRunner{t: t}, + config: model.DaemonConfig{ + HostHomeDir: homeDir, + FileSync: []model.FileSyncEntry{ + {Host: "~/gh-hosts.yml", Guest: "~/.config/gh/hosts.yml"}, + }, + }, + } + wireServices(d) + vm := testVM("sync-top-level-symlink-ok", "image", "172.16.0.77") + vm.Runtime.WorkDiskPath = workDisk + if err := d.ws.runFileSync(context.Background(), &vm); err != nil { + t.Fatalf("runFileSync: %v", err) + } + + got, err := os.ReadFile(filepath.Join(workDisk, ".config", "gh", "hosts.yml")) + if err != nil { + t.Fatal(err) + } + if string(got) != "github.com" { + t.Fatalf("guest file = %q, want github.com", got) + } +} + +func TestRunFileSyncRejectsTopLevelSymlinkOutsideHome(t *testing.T) { + homeDir := t.TempDir() + t.Setenv("HOME", homeDir) + + outsideDir := t.TempDir() + targetPath := filepath.Join(outsideDir, "secret.txt") + if err := os.WriteFile(targetPath, []byte("must-stay-outside"), 0o600); err != nil { + t.Fatal(err) + } + linkPath := filepath.Join(homeDir, "secret-link") + if err := os.Symlink(targetPath, linkPath); err != nil { + t.Skipf("symlink unsupported on this filesystem: %v", err) + } + + workDisk := t.TempDir() + d := &Daemon{ + runner: &filesystemRunner{t: t}, + config: model.DaemonConfig{ + HostHomeDir: homeDir, + FileSync: []model.FileSyncEntry{ + {Host: "~/secret-link", Guest: "~/secret.txt"}, + }, + }, + } + 
wireServices(d) + vm := testVM("sync-top-level-symlink-reject", "image", "172.16.0.78") + vm.Runtime.WorkDiskPath = workDisk + err := d.ws.runFileSync(context.Background(), &vm) + if err == nil || !strings.Contains(err.Error(), "owner home") { + t.Fatalf("runFileSync error = %v, want owner-home rejection", err) + } + if _, statErr := os.Stat(filepath.Join(workDisk, "secret.txt")); !os.IsNotExist(statErr) { + t.Fatalf("guest file exists after rejected sync (stat err = %v)", statErr) + } +} + // TestRunFileSyncSkipsNestedSymlinks pins the anti-sprawl contract: // a symlink INSIDE a synced directory is not followed, even if the // target holds real files. Without this, a user syncing ~/.aws with // a ~/.aws/session -> ~/other-creds symlink would copy the unrelated -// creds into the guest. Top-level entries (the path the user -// literally named) still follow, because they explicitly asked for -// that path. +// creds into the guest. Top-level entries are resolved separately: +// they may still follow, but only when the real target stays under +// the configured owner home. 
func TestRunFileSyncSkipsNestedSymlinks(t *testing.T) { homeDir := t.TempDir() t.Setenv("HOME", homeDir) @@ -1543,8 +1622,8 @@ func TestStopVMFallsBackToForcedCleanupAfterGracefulTimeout(t *testing.T) { scriptedRunner: &scriptedRunner{ t: t, steps: []runnerStep{ - sudoStep("", nil, "chown", fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid()), apiSock), sudoStep("", nil, "chmod", "600", apiSock), + sudoStep("", nil, "chown", fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid()), apiSock), {call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, out: []byte(strconv.Itoa(fake.Process.Pid) + "\n")}, sudoStep("", nil, "kill", "-KILL", strconv.Itoa(fake.Process.Pid)), }, diff --git a/internal/firecracker/client.go b/internal/firecracker/client.go index 063404f..f54fd9f 100644 --- a/internal/firecracker/client.go +++ b/internal/firecracker/client.go @@ -202,6 +202,18 @@ func defaultDriveID(drive DriveConfig, fallback string) string { } func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd { + if os.Geteuid() == 0 { + script := "umask 077 && exec " + shellQuote(cfg.BinaryPath) + + " --api-sock " + shellQuote(cfg.SocketPath) + + " --id " + shellQuote(cfg.VMID) + cmd := exec.Command("sh", "-c", script) + cmd.Stdin = nil + if logFile != nil { + cmd.Stdout = logFile + cmd.Stderr = logFile + } + return cmd + } // Two moving parts, run inside a single sudo'd shell: // // 1. 
umask 077 + exec firecracker → the API and vsock sockets diff --git a/internal/installmeta/installmeta.go b/internal/installmeta/installmeta.go new file mode 100644 index 0000000..e55678f --- /dev/null +++ b/internal/installmeta/installmeta.go @@ -0,0 +1,114 @@ +package installmeta + +import ( + "fmt" + "os" + "os/user" + "path/filepath" + "strconv" + "strings" + "time" + + toml "github.com/pelletier/go-toml" +) + +const ( + DefaultDir = "/etc/banger" + DefaultPath = DefaultDir + "/install.toml" + DefaultService = "bangerd.service" + DefaultRootHelperService = "bangerd-root.service" + DefaultSocketPath = "/run/banger/bangerd.sock" + DefaultRootHelperRuntimeDir = "/run/banger-root" + DefaultRootHelperSocketPath = DefaultRootHelperRuntimeDir + "/bangerd-root.sock" +) + +type Metadata struct { + OwnerUser string `toml:"owner_user"` + OwnerUID int `toml:"owner_uid"` + OwnerGID int `toml:"owner_gid"` + OwnerHome string `toml:"owner_home"` + InstalledAt time.Time `toml:"installed_at"` + Version string `toml:"version,omitempty"` + Commit string `toml:"commit,omitempty"` + BuiltAt string `toml:"built_at,omitempty"` +} + +func LookupOwner(name string) (Metadata, error) { + name = strings.TrimSpace(name) + if name == "" { + return Metadata{}, fmt.Errorf("owner username is required") + } + entry, err := user.Lookup(name) + if err != nil { + return Metadata{}, err + } + uid, err := strconv.Atoi(entry.Uid) + if err != nil { + return Metadata{}, fmt.Errorf("parse owner uid %q: %w", entry.Uid, err) + } + gid, err := strconv.Atoi(entry.Gid) + if err != nil { + return Metadata{}, fmt.Errorf("parse owner gid %q: %w", entry.Gid, err) + } + home := strings.TrimSpace(entry.HomeDir) + if home == "" || !filepath.IsAbs(home) { + return Metadata{}, fmt.Errorf("owner %q has invalid home directory %q", name, entry.HomeDir) + } + return Metadata{ + OwnerUser: name, + OwnerUID: uid, + OwnerGID: gid, + OwnerHome: home, + }, nil +} + +func Load(path string) (Metadata, error) { + if 
strings.TrimSpace(path) == "" { + path = DefaultPath + } + data, err := os.ReadFile(path) + if err != nil { + return Metadata{}, err + } + var meta Metadata + if err := toml.Unmarshal(data, &meta); err != nil { + return Metadata{}, err + } + if err := meta.Validate(); err != nil { + return Metadata{}, err + } + return meta, nil +} + +func Save(path string, meta Metadata) error { + if strings.TrimSpace(path) == "" { + path = DefaultPath + } + if err := meta.Validate(); err != nil { + return err + } + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return err + } + data, err := toml.Marshal(meta) + if err != nil { + return err + } + return os.WriteFile(path, data, 0o644) +} + +func (m Metadata) Validate() error { + if strings.TrimSpace(m.OwnerUser) == "" { + return fmt.Errorf("install metadata missing owner_user") + } + if m.OwnerUID < 0 { + return fmt.Errorf("install metadata has invalid owner_uid %d", m.OwnerUID) + } + if m.OwnerGID < 0 { + return fmt.Errorf("install metadata has invalid owner_gid %d", m.OwnerGID) + } + if strings.TrimSpace(m.OwnerHome) == "" || !filepath.IsAbs(m.OwnerHome) { + return fmt.Errorf("install metadata has invalid owner_home %q", m.OwnerHome) + } + return nil +} diff --git a/internal/installmeta/installmeta_test.go b/internal/installmeta/installmeta_test.go new file mode 100644 index 0000000..3901d88 --- /dev/null +++ b/internal/installmeta/installmeta_test.go @@ -0,0 +1,39 @@ +package installmeta + +import ( + "path/filepath" + "testing" + "time" +) + +func TestSaveLoadRoundTrip(t *testing.T) { + path := filepath.Join(t.TempDir(), "install.toml") + want := Metadata{ + OwnerUser: "dev", + OwnerUID: 1000, + OwnerGID: 1000, + OwnerHome: "/home/dev", + InstalledAt: time.Unix(1710000000, 0).UTC(), + Version: "v1.2.3", + Commit: "abc123", + BuiltAt: "2026-04-23T00:00:00Z", + } + + if err := Save(path, want); err != nil { + t.Fatalf("Save: %v", err) + } + got, err := Load(path) + if err != nil { + t.Fatalf("Load: %v", err) + } 
+ if got != want { + t.Fatalf("Load() = %+v, want %+v", got, want) + } +} + +func TestValidateRejectsMissingOwner(t *testing.T) { + err := Metadata{OwnerUID: 1000, OwnerGID: 1000, OwnerHome: "/home/dev"}.Validate() + if err == nil { + t.Fatal("Validate() = nil, want missing owner_user error") + } +} diff --git a/internal/model/types.go b/internal/model/types.go index 49f295f..fcc8744 100644 --- a/internal/model/types.go +++ b/internal/model/types.go @@ -37,6 +37,7 @@ type DaemonConfig struct { LogLevel string FirecrackerBin string SSHKeyPath string + HostHomeDir string AutoStopStaleAfter time.Duration StatsPollInterval time.Duration BridgeName string @@ -51,11 +52,12 @@ type DaemonConfig struct { // FileSyncEntry is a user-declared host→guest file or directory copy // applied to each VM's work disk at vm create time. Host is expanded -// against the host user's $HOME for "~/..."; Guest is expanded -// against /root (banger VMs are single-user root). If the host path -// is a directory, it's copied recursively; if it's a file, it's -// copied as a file. Missing host paths are a soft skip (warned, not -// fatal). Mode defaults to 0600 for files and 0755 for directories. +// against the configured owner home for "~/..." and must stay within +// that home; Guest is expanded against /root (banger VMs are +// single-user root). If the host path is a directory, it's copied +// recursively; if it's a file, it's copied as a file. Missing host +// paths are a soft skip (warned, not fatal). Mode defaults to 0600 +// for files and 0755 for directories. 
type FileSyncEntry struct { Host string Guest string diff --git a/internal/paths/layout_test.go b/internal/paths/layout_test.go index acb5328..9a15b5d 100644 --- a/internal/paths/layout_test.go +++ b/internal/paths/layout_test.go @@ -38,6 +38,36 @@ func TestResolveUsesXDGOverrides(t *testing.T) { } } +func TestResolveUserForHomeUsesProvidedHome(t *testing.T) { + home := filepath.Join(t.TempDir(), "owner") + layout, err := ResolveUserForHome(home) + if err != nil { + t.Fatalf("ResolveUserForHome: %v", err) + } + if layout.ConfigDir != filepath.Join(home, ".config", "banger") { + t.Fatalf("ConfigDir = %q", layout.ConfigDir) + } + if layout.StateDir != filepath.Join(home, ".local", "state", "banger") { + t.Fatalf("StateDir = %q", layout.StateDir) + } + if layout.KnownHostsPath != filepath.Join(home, ".local", "state", "banger", "ssh", "known_hosts") { + t.Fatalf("KnownHostsPath = %q", layout.KnownHostsPath) + } +} + +func TestResolveSystemUsesFixedPaths(t *testing.T) { + layout := ResolveSystem() + if layout.SocketPath != "/run/banger/bangerd.sock" { + t.Fatalf("SocketPath = %q", layout.SocketPath) + } + if layout.StateDir != "/var/lib/banger" { + t.Fatalf("StateDir = %q", layout.StateDir) + } + if layout.KnownHostsPath != "/var/lib/banger/ssh/known_hosts" { + t.Fatalf("KnownHostsPath = %q", layout.KnownHostsPath) + } +} + func TestResolveFallsBackWhenRuntimeUnset(t *testing.T) { t.Setenv("XDG_RUNTIME_DIR", "") layout, err := Resolve() diff --git a/internal/paths/paths.go b/internal/paths/paths.go index 9cdc455..25afbdc 100644 --- a/internal/paths/paths.go +++ b/internal/paths/paths.go @@ -4,9 +4,12 @@ import ( "errors" "fmt" "os" + "os/user" "path/filepath" "strings" "syscall" + + "banger/internal/installmeta" ) type Layout struct { @@ -37,6 +40,10 @@ type Layout struct { } func Resolve() (Layout, error) { + return ResolveUser() +} + +func ResolveUser() (Layout, error) { home, err := os.UserHomeDir() if err != nil { return Layout{}, err @@ -74,6 +81,52 @@ func 
Resolve() (Layout, error) { return layout, nil } +func ResolveUserForHome(home string) (Layout, error) { + home = strings.TrimSpace(home) + if home == "" { + return Layout{}, errors.New("home directory is required") + } + if !filepath.IsAbs(home) { + return Layout{}, fmt.Errorf("home directory %q must be absolute", home) + } + configHome := filepath.Join(home, ".config") + stateHome := filepath.Join(home, ".local", "state") + cacheHome := filepath.Join(home, ".cache") + layout := Layout{ + ConfigHome: configHome, + StateHome: stateHome, + CacheHome: cacheHome, + ConfigDir: filepath.Join(configHome, "banger"), + StateDir: filepath.Join(stateHome, "banger"), + CacheDir: filepath.Join(cacheHome, "banger"), + SSHDir: filepath.Join(stateHome, "banger", "ssh"), + } + layout.KnownHostsPath = filepath.Join(layout.SSHDir, "known_hosts") + return layout, nil +} + +func ResolveSystem() Layout { + layout := Layout{ + ConfigHome: "/etc", + StateHome: "/var/lib", + CacheHome: "/var/cache", + RuntimeHome: "/run", + ConfigDir: installmeta.DefaultDir, + StateDir: "/var/lib/banger", + CacheDir: "/var/cache/banger", + RuntimeDir: "/run/banger", + } + layout.SocketPath = installmeta.DefaultSocketPath + layout.DBPath = filepath.Join(layout.StateDir, "state.db") + layout.VMsDir = filepath.Join(layout.StateDir, "vms") + layout.ImagesDir = filepath.Join(layout.StateDir, "images") + layout.KernelsDir = filepath.Join(layout.StateDir, "kernels") + layout.OCICacheDir = filepath.Join(layout.CacheDir, "oci") + layout.SSHDir = filepath.Join(layout.StateDir, "ssh") + layout.KnownHostsPath = filepath.Join(layout.SSHDir, "known_hosts") + return layout +} + func Ensure(layout Layout) error { // When we're using the /tmp fallback, we must create and own the // runtime-home parent ourselves and reject any pre-existing directory @@ -117,6 +170,53 @@ func Ensure(layout Layout) error { return nil } +func EnsureSystem(layout Layout) error { + if strings.TrimSpace(layout.ConfigDir) != "" { + if err := 
os.MkdirAll(layout.ConfigDir, 0o755); err != nil { + return err + } + } + for _, dir := range []string{layout.StateDir, layout.CacheDir, layout.VMsDir, layout.ImagesDir, layout.KernelsDir, layout.OCICacheDir, layout.SSHDir} { + if strings.TrimSpace(dir) == "" { + continue + } + if err := os.MkdirAll(dir, 0o700); err != nil { + return err + } + if err := os.Chmod(dir, 0o700); err != nil { + return err + } + } + if strings.TrimSpace(layout.RuntimeDir) != "" { + if err := os.MkdirAll(layout.RuntimeDir, 0o711); err != nil { + return err + } + if err := os.Chmod(layout.RuntimeDir, 0o711); err != nil { + return err + } + } + return nil +} + +// EnsureSystemOwned prepares the systemd-managed directories the +// owner-user daemon needs once systemd has already created the top-level +// state/cache/runtime roots on its behalf. Unlike EnsureSystem, it does +// not touch /etc/banger and it never assumes root ownership. +func EnsureSystemOwned(layout Layout) error { + for _, dir := range []string{layout.StateDir, layout.CacheDir, layout.RuntimeDir, layout.VMsDir, layout.ImagesDir, layout.KernelsDir, layout.OCICacheDir, layout.SSHDir} { + if strings.TrimSpace(dir) == "" { + continue + } + if err := os.MkdirAll(dir, 0o700); err != nil { + return err + } + if err := os.Chmod(dir, 0o700); err != nil { + return err + } + } + return nil +} + // ensureSafeRuntimeHome creates path at 0700 if missing, or validates // existing ownership + mode. Returns an error describing how to remediate // when the existing directory doesn't meet the bar. 
@@ -169,6 +269,21 @@ func BangerdPath() (string, error) { return "", errors.New("bangerd binary not found next to banger; run `make build`") } +func BangerPath() (string, error) { + if env := os.Getenv("BANGER_BIN"); env != "" { + return env, nil + } + return executablePath() +} + +func CurrentUsername() (string, error) { + entry, err := user.Current() + if err != nil { + return "", err + } + return entry.Username, nil +} + func CompanionBinaryPath(name string) (string, error) { envNames := []string{ "BANGER_" + strings.ToUpper(strings.NewReplacer("-", "_", ".", "_").Replace(name)) + "_BIN", diff --git a/internal/roothelper/roothelper.go b/internal/roothelper/roothelper.go new file mode 100644 index 0000000..09bf4bd --- /dev/null +++ b/internal/roothelper/roothelper.go @@ -0,0 +1,840 @@ +package roothelper + +import ( + "bufio" + "context" + "encoding/json" + "errors" + "fmt" + "log/slog" + "net" + "os" + "path/filepath" + "strconv" + "strings" + "syscall" + "time" + + "golang.org/x/sys/unix" + + "banger/internal/daemon/dmsnap" + "banger/internal/daemon/fcproc" + "banger/internal/firecracker" + "banger/internal/hostnat" + "banger/internal/installmeta" + "banger/internal/paths" + "banger/internal/rpc" + "banger/internal/system" +) + +const ( + methodEnsureBridge = "priv.ensure_bridge" + methodCreateTap = "priv.create_tap" + methodDeleteTap = "priv.delete_tap" + methodSyncResolverRouting = "priv.sync_resolver_routing" + methodClearResolverRouting = "priv.clear_resolver_routing" + methodEnsureNAT = "priv.ensure_nat" + methodCreateDMSnapshot = "priv.create_dm_snapshot" + methodCleanupDMSnapshot = "priv.cleanup_dm_snapshot" + methodRemoveDMSnapshot = "priv.remove_dm_snapshot" + methodFsckSnapshot = "priv.fsck_snapshot" + methodReadExt4File = "priv.read_ext4_file" + methodWriteExt4Files = "priv.write_ext4_files" + methodResolveFirecrackerBin = "priv.resolve_firecracker_binary" + methodLaunchFirecracker = "priv.launch_firecracker" + methodEnsureSocketAccess = 
"priv.ensure_socket_access" + methodFindFirecrackerPID = "priv.find_firecracker_pid" + methodKillProcess = "priv.kill_process" + methodSignalProcess = "priv.signal_process" + methodProcessRunning = "priv.process_running" + rootfsDMNamePrefix = "fc-rootfs-" + vmTapPrefix = "tap-fc-" + tapPoolPrefix = "tap-pool-" + vmResolverRouteDomain = "~vm" + defaultFirecrackerBinaryName = "firecracker" +) + +type NetworkConfig struct { + BridgeName string `json:"bridge_name"` + BridgeIP string `json:"bridge_ip"` + CIDR string `json:"cidr"` +} + +type Ext4Write struct { + GuestPath string `json:"guest_path"` + Data []byte `json:"data"` + Mode uint32 `json:"mode"` +} + +type FirecrackerLaunchRequest struct { + BinaryPath string `json:"binary_path"` + VMID string `json:"vm_id"` + SocketPath string `json:"socket_path"` + LogPath string `json:"log_path"` + MetricsPath string `json:"metrics_path"` + KernelImagePath string `json:"kernel_image_path"` + InitrdPath string `json:"initrd_path,omitempty"` + KernelArgs string `json:"kernel_args"` + Drives []firecracker.DriveConfig `json:"drives"` + TapDevice string `json:"tap_device"` + VSockPath string `json:"vsock_path"` + VSockCID uint32 `json:"vsock_cid"` + VCPUCount int `json:"vcpu_count"` + MemoryMiB int `json:"memory_mib"` + Network NetworkConfig `json:"network"` +} + +type findPIDResult struct { + PID int `json:"pid"` +} + +type processRunningResult struct { + Running bool `json:"running"` +} + +type readExt4FileResult struct { + Data []byte `json:"data"` +} + +type resolveFirecrackerResult struct { + Path string `json:"path"` +} + +type launchFirecrackerResult struct { + PID int `json:"pid"` +} + +type Client struct { + socketPath string +} + +func NewClient(socketPath string) *Client { + return &Client{socketPath: strings.TrimSpace(socketPath)} +} + +func (c *Client) EnsureBridge(ctx context.Context, cfg NetworkConfig) error { + _, err := rpc.Call[struct{}](ctx, c.socketPath, methodEnsureBridge, cfg) + return err +} + +func (c 
*Client) CreateTap(ctx context.Context, cfg NetworkConfig, tapName string) error { + _, err := rpc.Call[struct{}](ctx, c.socketPath, methodCreateTap, struct { + NetworkConfig + TapName string `json:"tap_name"` + }{NetworkConfig: cfg, TapName: tapName}) + return err +} + +func (c *Client) DeleteTap(ctx context.Context, tapName string) error { + _, err := rpc.Call[struct{}](ctx, c.socketPath, methodDeleteTap, struct { + TapName string `json:"tap_name"` + }{TapName: tapName}) + return err +} + +func (c *Client) SyncResolverRouting(ctx context.Context, bridgeName, serverAddr string) error { + _, err := rpc.Call[struct{}](ctx, c.socketPath, methodSyncResolverRouting, struct { + BridgeName string `json:"bridge_name"` + ServerAddr string `json:"server_addr"` + }{BridgeName: bridgeName, ServerAddr: serverAddr}) + return err +} + +func (c *Client) ClearResolverRouting(ctx context.Context, bridgeName string) error { + _, err := rpc.Call[struct{}](ctx, c.socketPath, methodClearResolverRouting, struct { + BridgeName string `json:"bridge_name"` + }{BridgeName: bridgeName}) + return err +} + +func (c *Client) EnsureNAT(ctx context.Context, guestIP, tap string, enable bool) error { + _, err := rpc.Call[struct{}](ctx, c.socketPath, methodEnsureNAT, struct { + GuestIP string `json:"guest_ip"` + Tap string `json:"tap"` + Enable bool `json:"enable"` + }{GuestIP: guestIP, Tap: tap, Enable: enable}) + return err +} + +func (c *Client) CreateDMSnapshot(ctx context.Context, rootfsPath, cowPath, dmName string) (dmsnap.Handles, error) { + return rpc.Call[dmsnap.Handles](ctx, c.socketPath, methodCreateDMSnapshot, struct { + RootfsPath string `json:"rootfs_path"` + COWPath string `json:"cow_path"` + DMName string `json:"dm_name"` + }{RootfsPath: rootfsPath, COWPath: cowPath, DMName: dmName}) +} + +func (c *Client) CleanupDMSnapshot(ctx context.Context, handles dmsnap.Handles) error { + _, err := rpc.Call[struct{}](ctx, c.socketPath, methodCleanupDMSnapshot, handles) + return err +} + +func 
(c *Client) RemoveDMSnapshot(ctx context.Context, target string) error { + _, err := rpc.Call[struct{}](ctx, c.socketPath, methodRemoveDMSnapshot, struct { + Target string `json:"target"` + }{Target: target}) + return err +} + +func (c *Client) FsckSnapshot(ctx context.Context, dmDev string) error { + _, err := rpc.Call[struct{}](ctx, c.socketPath, methodFsckSnapshot, struct { + DMDev string `json:"dm_dev"` + }{DMDev: dmDev}) + return err +} + +func (c *Client) ReadExt4File(ctx context.Context, imagePath, guestPath string) ([]byte, error) { + result, err := rpc.Call[readExt4FileResult](ctx, c.socketPath, methodReadExt4File, struct { + ImagePath string `json:"image_path"` + GuestPath string `json:"guest_path"` + }{ImagePath: imagePath, GuestPath: guestPath}) + if err != nil { + return nil, err + } + return result.Data, nil +} + +func (c *Client) WriteExt4Files(ctx context.Context, imagePath string, files []Ext4Write) error { + _, err := rpc.Call[struct{}](ctx, c.socketPath, methodWriteExt4Files, struct { + ImagePath string `json:"image_path"` + Files []Ext4Write `json:"files"` + }{ImagePath: imagePath, Files: files}) + return err +} + +func (c *Client) ResolveFirecrackerBinary(ctx context.Context, requested string) (string, error) { + result, err := rpc.Call[resolveFirecrackerResult](ctx, c.socketPath, methodResolveFirecrackerBin, struct { + Requested string `json:"requested"` + }{Requested: requested}) + if err != nil { + return "", err + } + return result.Path, nil +} + +func (c *Client) LaunchFirecracker(ctx context.Context, req FirecrackerLaunchRequest) (int, error) { + result, err := rpc.Call[launchFirecrackerResult](ctx, c.socketPath, methodLaunchFirecracker, req) + if err != nil { + return 0, err + } + return result.PID, nil +} + +func (c *Client) EnsureSocketAccess(ctx context.Context, socketPath, label string) error { + _, err := rpc.Call[struct{}](ctx, c.socketPath, methodEnsureSocketAccess, struct { + SocketPath string `json:"socket_path"` + Label string 
`json:"label"` + }{SocketPath: socketPath, Label: label}) + return err +} + +func (c *Client) FindFirecrackerPID(ctx context.Context, apiSock string) (int, error) { + result, err := rpc.Call[findPIDResult](ctx, c.socketPath, methodFindFirecrackerPID, struct { + APISock string `json:"api_sock"` + }{APISock: apiSock}) + if err != nil { + return 0, err + } + return result.PID, nil +} + +func (c *Client) KillProcess(ctx context.Context, pid int) error { + _, err := rpc.Call[struct{}](ctx, c.socketPath, methodKillProcess, struct { + PID int `json:"pid"` + }{PID: pid}) + return err +} + +func (c *Client) SignalProcess(ctx context.Context, pid int, signal string) error { + _, err := rpc.Call[struct{}](ctx, c.socketPath, methodSignalProcess, struct { + PID int `json:"pid"` + Signal string `json:"signal"` + }{PID: pid, Signal: signal}) + return err +} + +func (c *Client) ProcessRunning(ctx context.Context, pid int, apiSock string) (bool, error) { + result, err := rpc.Call[processRunningResult](ctx, c.socketPath, methodProcessRunning, struct { + PID int `json:"pid"` + APISock string `json:"api_sock"` + }{PID: pid, APISock: apiSock}) + if err != nil { + return false, err + } + return result.Running, nil +} + +type Server struct { + meta installmeta.Metadata + runner system.CommandRunner + logger *slog.Logger + listener net.Listener +} + +func Open() (*Server, error) { + meta, err := installmeta.Load(installmeta.DefaultPath) + if err != nil { + return nil, err + } + if err := os.MkdirAll(installmeta.DefaultRootHelperRuntimeDir, 0o711); err != nil { + return nil, err + } + if err := os.Chmod(installmeta.DefaultRootHelperRuntimeDir, 0o711); err != nil { + return nil, err + } + return &Server{ + meta: meta, + runner: system.NewRunner(), + logger: slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelInfo})), + }, nil +} + +func (s *Server) Close() error { + if s == nil || s.listener == nil { + return nil + } + return s.listener.Close() +} + +func (s 
*Server) Serve(ctx context.Context) error { + _ = os.Remove(installmeta.DefaultRootHelperSocketPath) + listener, err := net.Listen("unix", installmeta.DefaultRootHelperSocketPath) + if err != nil { + return err + } + s.listener = listener + defer listener.Close() + defer os.Remove(installmeta.DefaultRootHelperSocketPath) + if err := os.Chmod(installmeta.DefaultRootHelperSocketPath, 0o600); err != nil { + return err + } + if err := os.Chown(installmeta.DefaultRootHelperSocketPath, s.meta.OwnerUID, s.meta.OwnerGID); err != nil { + return err + } + + done := make(chan struct{}) + defer close(done) + go func() { + select { + case <-ctx.Done(): + _ = listener.Close() + case <-done: + } + }() + + for { + conn, err := listener.Accept() + if err != nil { + select { + case <-ctx.Done(): + return nil + default: + } + var netErr net.Error + if errors.As(err, &netErr) && netErr.Temporary() { + time.Sleep(100 * time.Millisecond) + continue + } + return err + } + go s.handleConn(conn) + } +} + +func (s *Server) handleConn(conn net.Conn) { + defer conn.Close() + if err := s.authorizeConn(conn); err != nil { + _ = json.NewEncoder(conn).Encode(rpc.NewError("unauthorized", err.Error())) + return + } + var req rpc.Request + if err := json.NewDecoder(bufio.NewReader(conn)).Decode(&req); err != nil { + _ = json.NewEncoder(conn).Encode(rpc.NewError("bad_request", err.Error())) + return + } + resp := s.dispatch(context.Background(), req) + _ = json.NewEncoder(conn).Encode(resp) +} + +func (s *Server) authorizeConn(conn net.Conn) error { + unixConn, ok := conn.(*net.UnixConn) + if !ok { + return errors.New("root helper requires unix connections") + } + rawConn, err := unixConn.SyscallConn() + if err != nil { + return err + } + var cred *unix.Ucred + var controlErr error + if err := rawConn.Control(func(fd uintptr) { + cred, controlErr = unix.GetsockoptUcred(int(fd), unix.SOL_SOCKET, unix.SO_PEERCRED) + }); err != nil { + return err + } + if controlErr != nil { + return controlErr + } + if 
cred == nil { + return errors.New("missing peer credentials") + } + if int(cred.Uid) == 0 || int(cred.Uid) == s.meta.OwnerUID { + return nil + } + return fmt.Errorf("uid %d is not allowed to use the root helper", cred.Uid) +} + +func (s *Server) dispatch(ctx context.Context, req rpc.Request) rpc.Response { + switch req.Method { + case methodEnsureBridge: + params, err := rpc.DecodeParams[NetworkConfig](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + return marshalResultOrError(struct{}{}, s.ensureBridge(ctx, params)) + case methodCreateTap: + params, err := rpc.DecodeParams[struct { + NetworkConfig + TapName string `json:"tap_name"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + return marshalResultOrError(struct{}{}, s.createTap(ctx, params.NetworkConfig, params.TapName)) + case methodDeleteTap: + params, err := rpc.DecodeParams[struct { + TapName string `json:"tap_name"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + return marshalResultOrError(struct{}{}, s.deleteTap(ctx, params.TapName)) + case methodSyncResolverRouting: + params, err := rpc.DecodeParams[struct { + BridgeName string `json:"bridge_name"` + ServerAddr string `json:"server_addr"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + return marshalResultOrError(struct{}{}, s.syncResolverRouting(ctx, params.BridgeName, params.ServerAddr)) + case methodClearResolverRouting: + params, err := rpc.DecodeParams[struct { + BridgeName string `json:"bridge_name"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + return marshalResultOrError(struct{}{}, s.clearResolverRouting(ctx, params.BridgeName)) + case methodEnsureNAT: + params, err := rpc.DecodeParams[struct { + GuestIP string `json:"guest_ip"` + Tap string `json:"tap"` + Enable bool `json:"enable"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + return 
marshalResultOrError(struct{}{}, hostnat.Ensure(ctx, s.runner, params.GuestIP, params.Tap, params.Enable)) + case methodCreateDMSnapshot: + params, err := rpc.DecodeParams[struct { + RootfsPath string `json:"rootfs_path"` + COWPath string `json:"cow_path"` + DMName string `json:"dm_name"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + if err := s.validateManagedPath(params.RootfsPath, paths.ResolveSystem().StateDir); err != nil { + return rpc.NewError("bad_params", err.Error()) + } + if err := s.validateManagedPath(params.COWPath, paths.ResolveSystem().StateDir); err != nil { + return rpc.NewError("bad_params", err.Error()) + } + if err := validateDMName(params.DMName); err != nil { + return rpc.NewError("bad_params", err.Error()) + } + result, err := dmsnap.Create(ctx, s.runner, params.RootfsPath, params.COWPath, params.DMName) + return marshalResultOrError(result, err) + case methodCleanupDMSnapshot: + params, err := rpc.DecodeParams[dmsnap.Handles](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + return marshalResultOrError(struct{}{}, dmsnap.Cleanup(ctx, s.runner, params)) + case methodRemoveDMSnapshot: + params, err := rpc.DecodeParams[struct { + Target string `json:"target"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + return marshalResultOrError(struct{}{}, dmsnap.Remove(ctx, s.runner, params.Target)) + case methodFsckSnapshot: + params, err := rpc.DecodeParams[struct { + DMDev string `json:"dm_dev"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + return marshalResultOrError(struct{}{}, s.fsckSnapshot(ctx, params.DMDev)) + case methodReadExt4File: + params, err := rpc.DecodeParams[struct { + ImagePath string `json:"image_path"` + GuestPath string `json:"guest_path"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + data, readErr := system.ReadExt4File(ctx, s.runner, params.ImagePath, 
params.GuestPath) + return marshalResultOrError(readExt4FileResult{Data: data}, readErr) + case methodWriteExt4Files: + params, err := rpc.DecodeParams[struct { + ImagePath string `json:"image_path"` + Files []Ext4Write `json:"files"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + return marshalResultOrError(struct{}{}, s.writeExt4Files(ctx, params.ImagePath, params.Files)) + case methodResolveFirecrackerBin: + params, err := rpc.DecodeParams[struct { + Requested string `json:"requested"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + path, resolveErr := s.resolveFirecrackerBinary(params.Requested) + return marshalResultOrError(resolveFirecrackerResult{Path: path}, resolveErr) + case methodLaunchFirecracker: + params, err := rpc.DecodeParams[FirecrackerLaunchRequest](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + pid, launchErr := s.launchFirecracker(ctx, params) + return marshalResultOrError(launchFirecrackerResult{PID: pid}, launchErr) + case methodEnsureSocketAccess: + params, err := rpc.DecodeParams[struct { + SocketPath string `json:"socket_path"` + Label string `json:"label"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + return marshalResultOrError(struct{}{}, s.ensureSocketAccess(ctx, params.SocketPath, params.Label)) + case methodFindFirecrackerPID: + params, err := rpc.DecodeParams[struct { + APISock string `json:"api_sock"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + pid, findErr := fcproc.New(s.runner, fcproc.Config{}, s.logger).FindPID(ctx, params.APISock) + return marshalResultOrError(findPIDResult{PID: pid}, findErr) + case methodKillProcess: + params, err := rpc.DecodeParams[struct { + PID int `json:"pid"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + _, killErr := s.runner.Run(ctx, "kill", "-KILL", strconv.Itoa(params.PID)) + return 
marshalResultOrError(struct{}{}, killErr) + case methodSignalProcess: + params, err := rpc.DecodeParams[struct { + PID int `json:"pid"` + Signal string `json:"signal"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + signal := strings.TrimSpace(params.Signal) + if signal == "" { + signal = "TERM" + } + _, signalErr := s.runner.Run(ctx, "kill", "-"+signal, strconv.Itoa(params.PID)) + return marshalResultOrError(struct{}{}, signalErr) + case methodProcessRunning: + params, err := rpc.DecodeParams[struct { + PID int `json:"pid"` + APISock string `json:"api_sock"` + }](req) + if err != nil { + return rpc.NewError("bad_params", err.Error()) + } + return marshalResultOrError(processRunningResult{Running: system.ProcessRunning(params.PID, params.APISock)}, nil) + default: + return rpc.NewError("unknown_method", req.Method) + } +} + +func (s *Server) ensureBridge(ctx context.Context, cfg NetworkConfig) error { + return fcproc.New(s.runner, fcproc.Config{ + BridgeName: cfg.BridgeName, + BridgeIP: cfg.BridgeIP, + CIDR: cfg.CIDR, + }, s.logger).EnsureBridge(ctx) +} + +func (s *Server) createTap(ctx context.Context, cfg NetworkConfig, tapName string) error { + if err := validateTapName(tapName); err != nil { + return err + } + return fcproc.New(s.runner, fcproc.Config{ + BridgeName: cfg.BridgeName, + BridgeIP: cfg.BridgeIP, + CIDR: cfg.CIDR, + }, s.logger).CreateTapOwned(ctx, tapName, s.meta.OwnerUID, s.meta.OwnerGID) +} + +func (s *Server) deleteTap(ctx context.Context, tapName string) error { + if err := validateTapName(tapName); err != nil { + return err + } + _, err := s.runner.Run(ctx, "ip", "link", "del", tapName) + return err +} + +func (s *Server) syncResolverRouting(ctx context.Context, bridgeName, serverAddr string) error { + if strings.TrimSpace(bridgeName) == "" || strings.TrimSpace(serverAddr) == "" { + return nil + } + if _, err := system.LookupExecutable("resolvectl"); err != nil { + return nil + } + if _, err := 
s.runner.Run(ctx, "resolvectl", "dns", bridgeName, serverAddr); err != nil { + return err + } + if _, err := s.runner.Run(ctx, "resolvectl", "domain", bridgeName, vmResolverRouteDomain); err != nil { + return err + } + _, err := s.runner.Run(ctx, "resolvectl", "default-route", bridgeName, "no") + return err +} + +func (s *Server) clearResolverRouting(ctx context.Context, bridgeName string) error { + if strings.TrimSpace(bridgeName) == "" { + return nil + } + if _, err := system.LookupExecutable("resolvectl"); err != nil { + return nil + } + _, err := s.runner.Run(ctx, "resolvectl", "revert", bridgeName) + return err +} + +func (s *Server) fsckSnapshot(ctx context.Context, dmDev string) error { + if strings.TrimSpace(dmDev) == "" { + return errors.New("dm device is required") + } + if _, err := s.runner.Run(ctx, "e2fsck", "-fy", dmDev); err != nil { + if code := system.ExitCode(err); code < 0 || code > 1 { + return fmt.Errorf("fsck snapshot: %w", err) + } + } + return nil +} + +func (s *Server) writeExt4Files(ctx context.Context, imagePath string, files []Ext4Write) error { + for _, file := range files { + mode := os.FileMode(file.Mode) + if mode == 0 { + mode = 0o644 + } + if err := system.WriteExt4FileOwned(ctx, s.runner, imagePath, file.GuestPath, mode, 0, 0, file.Data); err != nil { + return err + } + } + return nil +} + +func (s *Server) resolveFirecrackerBinary(requested string) (string, error) { + requested = strings.TrimSpace(requested) + if requested == "" { + requested = defaultFirecrackerBinaryName + } + cfg := fcproc.Config{FirecrackerBin: requested} + resolved, err := fcproc.New(s.runner, cfg, s.logger).ResolveBinary() + if err != nil { + return "", err + } + if err := validateRootExecutable(resolved); err != nil { + return "", err + } + return resolved, nil +} + +func (s *Server) launchFirecracker(ctx context.Context, req FirecrackerLaunchRequest) (int, error) { + systemLayout := paths.ResolveSystem() + for _, path := range []string{req.SocketPath, 
req.VSockPath} { + if err := s.validateManagedPath(path, systemLayout.RuntimeDir); err != nil { + return 0, err + } + } + for _, path := range []string{req.LogPath, req.MetricsPath, req.KernelImagePath} { + if err := s.validateManagedPath(path, systemLayout.StateDir); err != nil { + return 0, err + } + } + if strings.TrimSpace(req.InitrdPath) != "" { + if err := s.validateManagedPath(req.InitrdPath, systemLayout.StateDir); err != nil { + return 0, err + } + } + if err := validateTapName(req.TapDevice); err != nil { + return 0, err + } + if err := validateRootExecutable(req.BinaryPath); err != nil { + return 0, err + } + for _, drive := range req.Drives { + if err := s.validateLaunchDrivePath(drive, systemLayout.StateDir); err != nil { + return 0, err + } + } + machine, err := firecracker.NewMachine(ctx, firecracker.MachineConfig{ + BinaryPath: req.BinaryPath, + VMID: req.VMID, + SocketPath: req.SocketPath, + LogPath: req.LogPath, + MetricsPath: req.MetricsPath, + KernelImagePath: req.KernelImagePath, + InitrdPath: req.InitrdPath, + KernelArgs: req.KernelArgs, + Drives: req.Drives, + TapDevice: req.TapDevice, + VSockPath: req.VSockPath, + VSockCID: req.VSockCID, + VCPUCount: req.VCPUCount, + MemoryMiB: req.MemoryMiB, + Logger: s.logger, + }) + if err != nil { + return 0, err + } + if err := machine.Start(ctx); err != nil { + manager := fcproc.New(s.runner, fcproc.Config{BridgeName: req.Network.BridgeName, BridgeIP: req.Network.BridgeIP, CIDR: req.Network.CIDR}, s.logger) + if pid := manager.ResolvePID(context.Background(), machine, req.SocketPath); pid > 0 { + _, _ = s.runner.Run(context.Background(), "kill", "-KILL", strconv.Itoa(pid)) + } + return 0, err + } + manager := fcproc.New(s.runner, fcproc.Config{BridgeName: req.Network.BridgeName, BridgeIP: req.Network.BridgeIP, CIDR: req.Network.CIDR}, s.logger) + if err := manager.EnsureSocketAccessFor(ctx, req.SocketPath, "firecracker api socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil { + return 0, err + } + 
if strings.TrimSpace(req.VSockPath) != "" { + if err := manager.EnsureSocketAccessFor(ctx, req.VSockPath, "firecracker vsock socket", s.meta.OwnerUID, s.meta.OwnerGID); err != nil { + return 0, err + } + } + pid := manager.ResolvePID(context.Background(), machine, req.SocketPath) + if pid <= 0 { + return 0, errors.New("firecracker started but pid could not be resolved") + } + return pid, nil +} + +func (s *Server) validateLaunchDrivePath(drive firecracker.DriveConfig, stateDir string) error { + if err := s.validateManagedPath(drive.Path, stateDir); err == nil { + return nil + } + if drive.IsRoot { + if err := validateDMDevicePath(drive.Path); err == nil { + return nil + } + } + return fmt.Errorf("path %q is outside banger-managed directories", drive.Path) +} + +func (s *Server) ensureSocketAccess(ctx context.Context, socketPath, label string) error { + return fcproc.New(s.runner, fcproc.Config{}, s.logger).EnsureSocketAccessFor(ctx, socketPath, label, s.meta.OwnerUID, s.meta.OwnerGID) +} + +func (s *Server) validateManagedPath(path string, roots ...string) error { + path = strings.TrimSpace(path) + if path == "" { + return errors.New("path is required") + } + if !filepath.IsAbs(path) { + return fmt.Errorf("path %q must be absolute", path) + } + cleaned := filepath.Clean(path) + for _, root := range roots { + root = strings.TrimSpace(root) + if root == "" { + continue + } + root = filepath.Clean(root) + if cleaned == root || strings.HasPrefix(cleaned, root+string(os.PathSeparator)) { + return nil + } + } + return fmt.Errorf("path %q is outside banger-managed directories", path) +} + +func validateTapName(tapName string) error { + tapName = strings.TrimSpace(tapName) + if strings.HasPrefix(tapName, vmTapPrefix) || strings.HasPrefix(tapName, tapPoolPrefix) { + return nil + } + return fmt.Errorf("tap %q is outside banger-managed naming", tapName) +} + +func validateDMName(dmName string) error { + dmName = strings.TrimSpace(dmName) + if strings.HasPrefix(dmName, 
rootfsDMNamePrefix) { + return nil + } + return fmt.Errorf("dm target %q is outside banger-managed naming", dmName) +} + +func validateDMDevicePath(path string) error { + path = strings.TrimSpace(path) + if path == "" { + return errors.New("dm device path is required") + } + if !filepath.IsAbs(path) { + return fmt.Errorf("dm device path %q must be absolute", path) + } + cleaned := filepath.Clean(path) + if filepath.Dir(cleaned) != "/dev/mapper" { + return fmt.Errorf("dm device path %q is outside /dev/mapper", path) + } + return validateDMName(filepath.Base(cleaned)) +} + +func validateRootExecutable(path string) error { + info, err := os.Stat(path) + if err != nil { + return err + } + if !info.Mode().IsRegular() { + return fmt.Errorf("firecracker binary %q is not a regular file", path) + } + if info.Mode().Perm()&0o111 == 0 { + return fmt.Errorf("firecracker binary %q is not executable", path) + } + if info.Mode().Perm()&0o022 != 0 { + return fmt.Errorf("firecracker binary %q must not be group/world writable", path) + } + stat, ok := info.Sys().(*syscall.Stat_t) + if !ok { + return fmt.Errorf("inspect owner for %q: unsupported file metadata", path) + } + if stat.Uid != 0 { + return fmt.Errorf("firecracker binary %q must be root-owned in system mode", path) + } + return nil +} + +func marshalResultOrError(v any, err error) rpc.Response { + if err != nil { + return rpc.NewError("operation_failed", err.Error()) + } + resp, marshalErr := rpc.NewResult(v) + if marshalErr != nil { + return rpc.NewError("marshal_failed", marshalErr.Error()) + } + return resp +} diff --git a/internal/roothelper/roothelper_test.go b/internal/roothelper/roothelper_test.go new file mode 100644 index 0000000..0570cb0 --- /dev/null +++ b/internal/roothelper/roothelper_test.go @@ -0,0 +1,55 @@ +package roothelper + +import ( + "testing" + + "banger/internal/firecracker" +) + +func TestValidateDMDevicePath(t *testing.T) { + t.Parallel() + + for _, tc := range []struct { + name string + path 
string + ok bool + }{ + {name: "valid", path: "/dev/mapper/fc-rootfs-test", ok: true}, + {name: "wrong_prefix", path: "/dev/mapper/not-banger", ok: false}, + {name: "wrong_dir", path: "/tmp/fc-rootfs-test", ok: false}, + {name: "relative", path: "fc-rootfs-test", ok: false}, + } { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + err := validateDMDevicePath(tc.path) + if tc.ok && err != nil { + t.Fatalf("validateDMDevicePath(%q) = %v, want nil", tc.path, err) + } + if !tc.ok && err == nil { + t.Fatalf("validateDMDevicePath(%q) succeeded, want error", tc.path) + } + }) + } +} + +func TestValidateLaunchDrivePathAllowsManagedRootDMDevice(t *testing.T) { + t.Parallel() + + srv := &Server{} + if err := srv.validateLaunchDrivePath(firecracker.DriveConfig{ + ID: "rootfs", + Path: "/dev/mapper/fc-rootfs-test", + IsRoot: true, + }, "/var/lib/banger"); err != nil { + t.Fatalf("validateLaunchDrivePath(root dm) = %v, want nil", err) + } + + if err := srv.validateLaunchDrivePath(firecracker.DriveConfig{ + ID: "work", + Path: "/dev/mapper/fc-rootfs-test", + IsRoot: false, + }, "/var/lib/banger"); err == nil { + t.Fatal("validateLaunchDrivePath(non-root dm) succeeded, want error") + } +} diff --git a/internal/system/system.go b/internal/system/system.go index 800f396..84a74df 100644 --- a/internal/system/system.go +++ b/internal/system/system.go @@ -71,6 +71,12 @@ func (Runner) Run(ctx context.Context, name string, args ...string) ([]byte, err } func (r Runner) RunSudo(ctx context.Context, args ...string) ([]byte, error) { + if os.Geteuid() == 0 { + if len(args) == 0 { + return nil, errors.New("command is required") + } + return r.Run(ctx, args[0], args[1:]...) + } all := append([]string{"-n"}, args...) return r.Run(ctx, "sudo", all...) } @@ -95,6 +101,9 @@ func (Runner) RunStdin(ctx context.Context, stdin io.Reader, name string, args . 
} func EnsureSudo(ctx context.Context) error { + if os.Geteuid() == 0 { + return nil + } cmd := exec.CommandContext(ctx, "sudo", "-v") cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr @@ -103,6 +112,9 @@ func EnsureSudo(ctx context.Context) error { } func CheckSudo(ctx context.Context) error { + if os.Geteuid() == 0 { + return nil + } if _, err := exec.LookPath("sudo"); err != nil { return err } diff --git a/scripts/smoke.sh b/scripts/smoke.sh index 616d062..0d5e95d --- a/scripts/smoke.sh +++ b/scripts/smoke.sh @@ -1,34 +1,27 @@ #!/usr/bin/env bash # -# scripts/smoke.sh — end-to-end smoke suite for banger. +# scripts/smoke.sh — end-to-end smoke suite for banger's supported +# two-service systemd model. # -# Drives a real create → start → ssh → exec → delete cycle against -# real Firecracker + real KVM on the host. Intended as a pre-release -# gate: the Go unit + integration tests don't and can't cover the -# post-machine.Start path (socket ownership, guest boot, vsock agent -# wait, guest SSH, workspace prepare). If this suite fails, don't -# ship. +# Installs instrumented binaries as temporary bangerd.service + +# bangerd-root.service, drives real Firecracker/KVM scenarios, collects +# covdata from both services plus the CLI, then purges the smoke-owned +# install on exit. # -# State lives under $BANGER_SMOKE_XDG_DIR (set by `make smoke`, -# defaults to build/smoke/xdg). It's ISOLATED from the invoking -# user's real banger install via XDG_{CONFIG,STATE,CACHE,RUNTIME} -# overrides, but PERSISTED across runs — so the first smoke pulls -# the golden image, subsequent smokes reuse it. `make smoke-clean` -# wipes it. +# Because the supported path is global host state, smoke refuses to +# overwrite a pre-existing non-smoke install. If a prior smoke crashed, +# rerun `make smoke-clean` or `make smoke`; the smoke marker lets the +# harness purge only its own stale install safely. 
# -# Invoked via `make smoke`, which sets the three env vars below. -# Don't run this directly unless you know they're set. +# Scratch files live under $BANGER_SMOKE_XDG_DIR (historic name kept for +# make-compat). Service state uses the real supported system paths and is +# purged by the smoke cleanup path. set -euo pipefail log() { printf '[smoke] %s\n' "$*" >&2; } die() { printf '[smoke] FAIL: %s\n' "$*" >&2; exit 1; } -# wait_for_ssh polls `vm ssh -- true` until it succeeds or the -# timeout expires. `vm ssh` — unlike `vm run` — does not itself wait -# for guest sshd, so scenarios that call `vm create` / `vm start` -# back-to-back with `vm ssh` need this shim. 60s matches -# vmRunSSHTimeout. wait_for_ssh() { local vm="$1" local deadline=$(( $(date +%s) + 60 )) @@ -43,7 +36,7 @@ wait_for_ssh() { : "${BANGER_SMOKE_BIN_DIR:?must point at the instrumented binary dir, set by make smoke}" : "${BANGER_SMOKE_COVER_DIR:?must point at the coverage dir, set by make smoke}" -: "${BANGER_SMOKE_XDG_DIR:?must point at the isolated XDG root, set by make smoke}" +: "${BANGER_SMOKE_XDG_DIR:?must point at the smoke scratch root, set by make smoke}" BANGER="$BANGER_SMOKE_BIN_DIR/banger" BANGERD="$BANGER_SMOKE_BIN_DIR/bangerd" @@ -53,53 +46,108 @@ for bin in "$BANGER" "$BANGERD" "$VSOCK_AGENT"; do [[ -x "$bin" ]] || die "binary missing or not executable: $bin" done -# Persistent XDG dirs (state, cache, config) so repeated smoke -# runs reuse the pulled golden image instead of re-downloading -# ~300MB each time. Runtime dir needs to be fresh per-run because -# it holds sockets the daemon cleans up on stop and refuses to -# reuse if any are stale. 
-mkdir -p \ - "$BANGER_SMOKE_XDG_DIR/config" \ - "$BANGER_SMOKE_XDG_DIR/state" \ - "$BANGER_SMOKE_XDG_DIR/cache" -runtime_dir="$(mktemp -d -t banger-smoke-runtime-XXXXXX)" -# shellcheck disable=SC2064 -trap "rm -rf '$runtime_dir'" EXIT -chmod 0700 "$runtime_dir" +scratch_root="$BANGER_SMOKE_XDG_DIR" +runtime_dir= +smoke_owner="$(id -un)" +smoke_marker='/etc/banger/.smoke-owned' +service_cover_dir='/var/lib/banger' +owner_service='bangerd.service' +root_service='bangerd-root.service' -export XDG_CONFIG_HOME="$BANGER_SMOKE_XDG_DIR/config" -export XDG_STATE_HOME="$BANGER_SMOKE_XDG_DIR/state" -export XDG_CACHE_HOME="$BANGER_SMOKE_XDG_DIR/cache" -export XDG_RUNTIME_DIR="$runtime_dir" +mkdir -p "$BANGER_SMOKE_COVER_DIR" +rm -rf "$scratch_root" +mkdir -p "$scratch_root" +runtime_dir="$(mktemp -d "$scratch_root/runtime-XXXXXX")" -# Point banger at its companion binaries inside the smoke build. -export BANGER_DAEMON_BIN="$BANGERD" -export BANGER_VSOCK_AGENT_BIN="$VSOCK_AGENT" - -# Instrumented binaries dump coverage here on clean exit. +# The CLI binary itself is instrumented, so keep its covdata local. export GOCOVERDIR="$BANGER_SMOKE_COVER_DIR" -mkdir -p "$GOCOVERDIR" -# Any smoke daemon left behind from a prior run that crashed mid- -# scenario would reuse the stale socket path and confuse -# ensureDaemon. Best-effort stop; ignore if nothing is running. -"$BANGER" daemon stop >/dev/null 2>&1 || true +cleanup_export_vm() { + "$BANGER" vm delete smoke-export >/dev/null 2>&1 || true +} -# banger's vmDNS binds 127.0.0.1:42069 (UDP) hard. If the user's -# real (non-smoke) daemon is running, its listener holds the port -# and the smoke daemon's Open() fails before any scenario runs. -# Fail fast with an actionable message — don't guess whether to -# stop the user's daemon for them. 
-if command -v ss >/dev/null 2>&1 && ss -Huln 2>/dev/null | awk '{print $4}' | grep -q '[:.]42069$'; then - die 'port 127.0.0.1:42069 is already bound (likely your real banger daemon); stop it with `banger daemon stop` and re-run `make smoke`' +cleanup_prune() { + "$BANGER" vm delete smoke-prune-running >/dev/null 2>&1 || true + "$BANGER" vm delete smoke-prune-stopped >/dev/null 2>&1 || true +} + +collect_service_coverage() { + local uid gid + uid="$(id -u)" + gid="$(id -g)" + sudo bash -lc ' + set -euo pipefail + shopt -s nullglob + dst="$1" + uid="$2" + gid="$3" + src="$4" + for file in "$src"/covmeta.* "$src"/covcounters.*; do + base="${file##*/}" + cp "$file" "$dst/$base" + chown "$uid:$gid" "$dst/$base" + chmod 0644 "$dst/$base" + done + ' bash "$BANGER_SMOKE_COVER_DIR" "$uid" "$gid" "$service_cover_dir" +} + +stop_services_for_coverage() { + sudo systemctl stop "$owner_service" "$root_service" >/dev/null 2>&1 || true +} + +sudo_banger() { + sudo env GOCOVERDIR="$BANGER_SMOKE_COVER_DIR" "$@" +} + +cleanup() { + set +e + for vm in \ + smoke-lifecycle smoke-set smoke-restart smoke-kill smoke-ports smoke-fc \ + smoke-basecommit smoke-nat smoke-nocnat; do + "$BANGER" vm delete "$vm" >/dev/null 2>&1 || true + done + cleanup_export_vm + cleanup_prune + stop_services_for_coverage + collect_service_coverage + sudo_banger "$BANGER" system uninstall --purge >/dev/null 2>&1 || true + rm -rf "$scratch_root" +} +trap cleanup EXIT + +if sudo test -f /etc/banger/install.toml; then + if sudo test -f "$smoke_marker"; then + log 'found stale smoke-owned install; purging it first' + sudo_banger "$BANGER" system uninstall --purge >/dev/null 2>&1 || true + else + die 'banger is already installed on this host; supported-path smoke refuses to overwrite a non-smoke install' + fi fi -# --- doctor ----------------------------------------------------------- +log 'installing smoke-owned services' +sudo env \ + GOCOVERDIR="$BANGER_SMOKE_COVER_DIR" \ + 
BANGER_SYSTEM_GOCOVERDIR="$service_cover_dir" \ + BANGER_ROOT_HELPER_GOCOVERDIR="$service_cover_dir" \ + "$BANGER" system install --owner "$smoke_owner" >/dev/null \ + || die 'system install failed' +sudo touch "$smoke_marker" + +status_out="$("$BANGER" system status)" || die 'system status failed after install' +grep -q '^active: active' <<<"$status_out" || die "owner daemon not active after install: $status_out" +grep -q '^helper_active: active' <<<"$status_out" || die "root helper not active after install: $status_out" + log 'doctor: checking host readiness' if ! "$BANGER" doctor; then die 'doctor reported failures; fix the host before running smoke' fi +log 'system restart: services should come back cleanly' +sudo_banger "$BANGER" system restart >/dev/null || die 'system restart failed' +status_out="$("$BANGER" system status)" || die 'system status failed after restart' +grep -q '^active: active' <<<"$status_out" || die "owner daemon not active after restart: $status_out" +grep -q '^helper_active: active' <<<"$status_out" || die "root helper not active after restart: $status_out" + # --- bare vm run ------------------------------------------------------ log "bare vm run: create + start + ssh + exec 'echo smoke-bare-ok' + --rm" bare_out="$("$BANGER" vm run --rm -- echo smoke-bare-ok)" || die "bare vm run exit $?" @@ -125,11 +173,6 @@ ws_out="$("$BANGER" vm run --rm "$repodir" -- cat /root/repo/smoke-file.txt)" || grep -q 'smoke-workspace-marker' <<<"$ws_out" || die "workspace vm run didn't ship smoke-file.txt: $ws_out" # --- command exit-code propagation ------------------------------------
log 'exit-code propagation: guest `sh -c "exit 42"` must produce rc=42' set +e "$BANGER" vm run --rm -- sh -c 'exit 42' @@ -138,66 +181,35 @@ set -e [[ "$rc" -eq 42 ]] || die "exit-code propagation: got rc=$rc, want 42" # --- workspace dry-run (no VM) ---------------------------------------- -# Pure CLI-side path — no VM, no sudo, just the local git inspection -# against d.repoInspector. Fast; catches regressions in the preview -# output (file list shape, mode line) that the Go tests already pin -# but that could still be broken by a client-side wiring change. log 'workspace dry-run: list tracked files without creating a VM' dry_out="$("$BANGER" vm run --dry-run "$repodir")" || die "dry-run exit $?" grep -q 'smoke-file.txt' <<<"$dry_out" || die "dry-run didn't list smoke-file.txt: $dry_out" grep -q 'mode: tracked only' <<<"$dry_out" || die "dry-run mode line missing or wrong: $dry_out" # --- workspace --include-untracked ----------------------------------- -# The default is tracked-only (review cycle 4). Opt-in must ship -# untracked files too. Write one, run with --include-untracked, verify -# it reaches the guest. log 'workspace --include-untracked: opt-in ships files outside the git index' echo 'untracked-marker' > "$repodir/smoke-untracked.txt" inc_out="$("$BANGER" vm run --rm --include-untracked "$repodir" -- cat /root/repo/smoke-untracked.txt)" || die "include-untracked vm run exit $?" grep -q 'untracked-marker' <<<"$inc_out" || die "--include-untracked didn't ship the untracked file: $inc_out" -# Restore repo to tracked-only state for any later scenarios. rm -f "$repodir/smoke-untracked.txt" # --- workspace export round-trip -------------------------------------- -# Exercises ExportVMWorkspace: create a VM, prepare the workspace, -# write a new file inside the guest, then export and assert the -# emitted patch sees the guest-side change. 
If the export pipeline -# (temp-index, git add -A, diff --binary) ever stops capturing -# guest-side changes, this scenario catches it. log 'workspace export: create + prepare + guest edit + export + assert marker' -export_vm='smoke-export' -cleanup_export_vm() { - "$BANGER" vm delete "$export_vm" >/dev/null 2>&1 || true -} -# Chain the VM cleanup with the existing runtime_dir trap so a mid- -# scenario failure still tears the VM down before the script exits. -# shellcheck disable=SC2064 -trap "cleanup_export_vm; rm -rf '$runtime_dir'" EXIT - -"$BANGER" vm create --name "$export_vm" --image debian-bookworm >/dev/null \ +"$BANGER" vm create --name smoke-export --image debian-bookworm >/dev/null \ || die "export: vm create exit $?" -"$BANGER" vm workspace prepare "$export_vm" "$repodir" >/dev/null \ +"$BANGER" vm workspace prepare smoke-export "$repodir" >/dev/null \ || die "export: workspace prepare exit $?" -"$BANGER" vm ssh "$export_vm" -- sh -c 'echo guest-edit > /root/repo/new-guest-file.txt' \ +"$BANGER" vm ssh smoke-export -- sh -c 'echo guest-edit > /root/repo/new-guest-file.txt' \ || die "export: guest-side file write exit $?" export_patch="$runtime_dir/smoke-export.diff" -"$BANGER" vm workspace export "$export_vm" --output "$export_patch" \ +"$BANGER" vm workspace export smoke-export --output "$export_patch" \ || die "export: workspace export exit $?" [[ -s "$export_patch" ]] || die "export: patch file empty at $export_patch" grep -q 'new-guest-file.txt' "$export_patch" \ || die "export: patch missing new-guest-file.txt marker (head: $(head -c 400 "$export_patch"))" - cleanup_export_vm -# shellcheck disable=SC2064 -trap "rm -rf '$runtime_dir'" EXIT # --- concurrent vm runs ----------------------------------------------- -# Stresses per-VM lock scoping, the tap pool warm-up path, and -# createVMMu's narrow reservation window. Two `vm run --rm` invocations -# that actually overlap should both succeed. 
A regression that -# serialises create path too aggressively would make this slow but -# still pass; a regression that breaks tap allocation or name -# uniqueness would fail one of them. log 'concurrent vm runs: two --rm invocations must both succeed' tmpA="$runtime_dir/concurrent-a.out" tmpB="$runtime_dir/concurrent-b.out" @@ -211,18 +223,8 @@ grep -q 'smoke-concurrent-a' "$tmpA" || die "concurrent VM A missing marker: $(c grep -q 'smoke-concurrent-b' "$tmpB" || die "concurrent VM B missing marker: $(cat "$tmpB")" # --- vm lifecycle (create → stop → start → delete) -------------------- -# Exercises lifecycle verbs directly instead of the --rm convenience -# path. The critical assertion is the second `vm ssh` AFTER stop/start: -# that path (a) rebuilds the handle cache via rediscoverHandles, -# (b) runs the e2fsck-snapshot sanitize step before patchRootOverlay -# on the dirty COW, and (c) shouldn't die from the SDK's -# ctx-SIGTERM-on-RPC-close goroutine. All three were bugs at one -# point; this scenario guards all three at once. log 'vm lifecycle: explicit create / stop / start / ssh / delete' lifecycle_name=smoke-lifecycle -# shellcheck disable=SC2064 -trap "\"$BANGER\" vm delete $lifecycle_name >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT - "$BANGER" vm create --name "$lifecycle_name" >/dev/null || die "vm create $lifecycle_name failed" show_out="$("$BANGER" vm show "$lifecycle_name")" || die "vm show after create failed" grep -q '"state": "running"' <<<"$show_out" || die "post-create state not running: $show_out" @@ -249,18 +251,9 @@ set +e rc=$? set -e [[ "$rc" -ne 0 ]] || die "vm show still finds $lifecycle_name after delete" -# shellcheck disable=SC2064 -trap "rm -rf '$runtime_dir'" EXIT # --- vm set reconfiguration (vcpu change + restart) ------------------- -# Exercises SetVM + configChangeCapability. Create with --vcpu 2, -# stop, `vm set --vcpu 4`, restart, confirm the guest sees the new -# count. 
Regression guard: a restart that reuses the pre-change spec -# would leave nproc at 2. log 'vm set: create --vcpu 2 → stop → set --vcpu 4 → restart → nproc=4' -# shellcheck disable=SC2064 -trap "\"$BANGER\" vm delete smoke-set >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT - "$BANGER" vm create --name smoke-set --vcpu 2 >/dev/null || die 'vm set: create failed' wait_for_ssh smoke-set || die 'vm set: initial ssh did not come up' @@ -286,19 +279,9 @@ set -e || die "vm set: post-reconfig nproc got '$nproc_after', want 4 (spec change didn't land)" "$BANGER" vm delete smoke-set >/dev/null || die 'vm set: delete failed' -# shellcheck disable=SC2064 -trap "rm -rf '$runtime_dir'" EXIT # --- vm restart (dedicated verb) -------------------------------------- -# `vm restart` is its own verb, not a stop+start composite at the API -# level — it must end up with a freshly booted guest. The assertion is -# a fresh boot ID: /proc/sys/kernel/random/boot_id changes on every -# kernel boot, so post-restart != pre-restart proves the kernel was -# actually recycled rather than the verb no-op'ing. 
log 'vm restart: boot_id must change across the verb' -# shellcheck disable=SC2064 -trap "\"$BANGER\" vm delete smoke-restart >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT - "$BANGER" vm create --name smoke-restart >/dev/null || die 'vm restart: create failed' wait_for_ssh smoke-restart || die 'vm restart: initial ssh never came up' boot_before="$("$BANGER" vm ssh smoke-restart -- cat /proc/sys/kernel/random/boot_id | tr -d '[:space:]')" @@ -312,19 +295,9 @@ boot_after="$("$BANGER" vm ssh smoke-restart -- cat /proc/sys/kernel/random/boot || die "vm restart: boot_id unchanged ($boot_before); verb didn't actually reboot the guest" "$BANGER" vm delete smoke-restart >/dev/null || die 'vm restart: delete failed' -# shellcheck disable=SC2064 -trap "rm -rf '$runtime_dir'" EXIT # --- vm kill (--signal KILL, forceful path) --------------------------- -# `vm stop` takes the graceful Ctrl-Alt-Del route. `vm kill --signal -# KILL` is the explicit "the guest is wedged, drop it" path. It must -# (a) terminate firecracker, (b) leave the VM record in a stopped -# state (not 'error'), (c) tear down the dm-snapshot + loops so the -# next create/start doesn't trip over leftovers. 
log 'vm kill --signal KILL: forceful terminate, state=stopped, no leaked dm device' -# shellcheck disable=SC2064 -trap "\"$BANGER\" vm delete smoke-kill >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT - "$BANGER" vm create --name smoke-kill >/dev/null || die 'vm kill: create failed' dm_name="$("$BANGER" vm show smoke-kill 2>/dev/null | awk -F'"' '/"dm_dev"|fc-rootfs-/ {for(i=1;i<=NF;i++) if($i~/^fc-rootfs-/) print $i}' | head -1 || true)" "$BANGER" vm kill --signal KILL smoke-kill >/dev/null || die 'vm kill: verb failed' @@ -336,22 +309,9 @@ if [[ -n "$dm_name" ]]; then fi fi "$BANGER" vm delete smoke-kill >/dev/null || die 'vm kill: delete failed' -# shellcheck disable=SC2064 -trap "rm -rf '$runtime_dir'" EXIT # --- vm prune (-f) ---------------------------------------------------- -# Create two VMs: one running, one stopped. `vm prune -f` must delete -# the stopped one and leave the running one alone. Skip interactive -# confirmation with -f (smoke has no tty). Regression guard: a bug -# that deleted the running VM would wreck any session the user had. 
log 'vm prune -f: removes stopped VMs, preserves running ones' -cleanup_prune() { - "$BANGER" vm delete smoke-prune-running >/dev/null 2>&1 || true - "$BANGER" vm delete smoke-prune-stopped >/dev/null 2>&1 || true -} -# shellcheck disable=SC2064 -trap "cleanup_prune; rm -rf '$runtime_dir'" EXIT - "$BANGER" vm create --name smoke-prune-running >/dev/null || die 'vm prune: create running failed' "$BANGER" vm create --name smoke-prune-stopped >/dev/null || die 'vm prune: create stopped failed' "$BANGER" vm stop smoke-prune-stopped >/dev/null || die 'vm prune: stop the stopped one failed' @@ -364,20 +324,9 @@ if "$BANGER" vm show smoke-prune-stopped >/dev/null 2>&1; then fi "$BANGER" vm delete smoke-prune-running >/dev/null || die 'vm prune: cleanup delete failed' -# shellcheck disable=SC2064 -trap "rm -rf '$runtime_dir'" EXIT # --- vm ports --------------------------------------------------------- -# sshd binds :22 in every guest — it's the minimum promise of a VM. -# If `vm ports` can't see that, the host→guest port visibility pipe -# (vsock-agent on-demand query, daemon aggregation, CLI rendering) is -# broken. Endpoint shape is also asserted: daemon prefers the -# .vm DNS record over the raw guest IP, so we grep for the -# name form. log 'vm ports: sshd :22 visible from host, endpoint uses the VM DNS name' -# shellcheck disable=SC2064 -trap "\"$BANGER\" vm delete smoke-ports >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT - "$BANGER" vm create --name smoke-ports >/dev/null || die 'vm ports: create failed' wait_for_ssh smoke-ports || die 'vm ports: ssh did not come up' @@ -389,18 +338,9 @@ grep -q 'sshd' <<<"$ports_out" \ || die "vm ports: expected process 'sshd' in output; got: $ports_out" "$BANGER" vm delete smoke-ports >/dev/null || die 'vm ports: delete failed' -# shellcheck disable=SC2064 -trap "rm -rf '$runtime_dir'" EXIT # --- workspace prepare --mode full_copy ------------------------------- -# Default mode is shallow_overlay. 
full_copy copies the repo via a -# different transfer path (tar stream into the guest's rootfs with -# no overlay). Smoke asserts it still lands the content at the same -# guest path. log 'workspace prepare --mode full_copy: alternate transfer path still delivers' -# shellcheck disable=SC2064 -trap "\"$BANGER\" vm delete smoke-fc >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT - "$BANGER" vm create --name smoke-fc >/dev/null || die 'workspace fc: create failed' "$BANGER" vm workspace prepare smoke-fc "$repodir" --mode full_copy >/dev/null \ || die 'workspace fc: prepare --mode full_copy failed' @@ -410,42 +350,26 @@ grep -q 'smoke-workspace-marker' <<<"$fc_out" \ || die "workspace fc: marker missing in full_copy workspace: $fc_out" "$BANGER" vm delete smoke-fc >/dev/null || die 'workspace fc: delete failed' -# shellcheck disable=SC2064 -trap "rm -rf '$runtime_dir'" EXIT # --- workspace export --base-commit (committed guest delta) ----------- -# Without --base-commit, export diffs the worktree against HEAD — it -# misses commits the worker made inside the guest (because the guest -# HEAD advanced). With --base-commit pinned at the prepare-time SHA, -# those commits land in the patch. This is the happy path the feature -# was added for; zero coverage until now. log 'workspace export --base-commit: guest-side commits captured in patch' -# shellcheck disable=SC2064 -trap "\"$BANGER\" vm delete smoke-basecommit >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT - "$BANGER" vm create --name smoke-basecommit >/dev/null || die 'export base: create failed' "$BANGER" vm workspace prepare smoke-basecommit "$repodir" >/dev/null \ || die 'export base: prepare failed' -# Capture the prepare-time HEAD from the guest directly (same SHA the -# daemon returns as HeadCommit in the RPC result). 
base_sha="$("$BANGER" vm ssh smoke-basecommit -- sh -c 'cd /root/repo && git rev-parse HEAD' | tr -d '[:space:]')" [[ "${#base_sha}" -eq 40 ]] || die "export base: bad base sha: $base_sha" -# Make a guest-side commit: new file + git add + git commit. Without -# --base-commit, this commit would be invisible to a HEAD-relative diff. "$BANGER" vm ssh smoke-basecommit -- sh -c "cd /root/repo && git -c user.email=smoke@smoke -c user.name=smoke checkout -b smoke-branch >/dev/null 2>&1 && echo committed-marker > smoke-committed.txt && git add smoke-committed.txt && git -c user.email=smoke@smoke -c user.name=smoke commit -q -m 'guest side'" \ || die 'export base: guest-side commit failed' -# Control: plain export (no --base-commit) must NOT see the committed file. plain_patch="$runtime_dir/smoke-plain.diff" "$BANGER" vm workspace export smoke-basecommit --output "$plain_patch" \ || die 'export base: plain export failed' -if grep -q 'smoke-committed.txt' "$plain_patch"; then +if [[ -f "$plain_patch" ]] && grep -q 'smoke-committed.txt' "$plain_patch"; then die 'export base: plain export unexpectedly captured the guest-side commit' fi -# With --base-commit pinned to the pre-commit SHA, the delta appears. base_patch="$runtime_dir/smoke-base.diff" "$BANGER" vm workspace export smoke-basecommit --base-commit "$base_sha" --output "$base_patch" \ || die 'export base: --base-commit export failed' @@ -454,21 +378,11 @@ grep -q 'smoke-committed.txt' "$base_patch" \ || die "export base: --base-commit patch missing committed marker (head: $(head -c 400 "$base_patch"))" "$BANGER" vm delete smoke-basecommit >/dev/null || die 'export base: delete failed' -# shellcheck disable=SC2064 -trap "rm -rf '$runtime_dir'" EXIT # --- ssh-config install / uninstall (HOME-isolated) ------------------- -# `banger ssh-config --install` edits ~/.ssh/config. 
Smoke runs under -# the invoking user, so we isolate by pointing HOME at the smoke XDG -# dir before the commands run (os.UserHomeDir respects $HOME on -# Linux). No daemon / VM involved — pure CLI + filesystem surface, -# exercising the install/status/uninstall code paths end-to-end. log 'ssh-config --install / --uninstall: idempotent, survives round-trip' -fake_home="$BANGER_SMOKE_XDG_DIR/fake-home" +fake_home="$scratch_root/fake-home" mkdir -p "$fake_home/.ssh" -# Seed a pre-existing ~/.ssh/config so install must APPEND, not -# replace. A bug that clobbered pre-existing content would nuke the -# user's real config on first run. printf 'Host myserver\n HostName example.invalid\n' > "$fake_home/.ssh/config" ( @@ -479,7 +393,6 @@ printf 'Host myserver\n HostName example.invalid\n' > "$fake_home/.ssh/config" grep -q '^Host myserver' "$fake_home/.ssh/config" \ || die 'ssh-config: install clobbered pre-existing content (!!)' - # Second install must be idempotent (no duplicate Include lines). "$BANGER" ssh-config --install >/dev/null || die 'ssh-config: second install failed' include_count="$(grep -c '^Include .*banger' "$fake_home/.ssh/config")" [[ "$include_count" == "1" ]] \ @@ -494,22 +407,10 @@ printf 'Host myserver\n HostName example.invalid\n' > "$fake_home/.ssh/config" ) # --- NAT rule installation (per-VM MASQUERADE) ------------------------ -# `--nat` installs a per-VM iptables POSTROUTING MASQUERADE rule -# scoped to the guest's /32 (see natCapability). End-to-end curl -# tests don't work here because the bridge IP and the host's uplink -# IP both belong to the host — a guest reaching the uplink address -# lands on the host's local loopback whether MASQUERADE is set up -# or not. So assert the rule itself: NAT VM gets a POSTROUTING -# MASQUERADE, non-NAT VM does not. This catches the two most -# plausible regressions (rule never installed; rule not scoped to -# the right VM) without depending on an external reachable host. 
log 'NAT: --nat installs a per-VM MASQUERADE rule; no --nat means no rule' if ! sudo -n iptables -t nat -S POSTROUTING >/dev/null 2>&1; then log 'NAT: skipping — passwordless sudo iptables unavailable' else - # shellcheck disable=SC2064 - trap "\"$BANGER\" vm delete smoke-nat >/dev/null 2>&1 || true; \"$BANGER\" vm delete smoke-nocnat >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT - "$BANGER" vm create --name smoke-nat --nat >/dev/null || die 'NAT: create --nat failed' "$BANGER" vm create --name smoke-nocnat >/dev/null || die 'NAT: control create failed' @@ -524,9 +425,6 @@ else die "NAT: control VM unexpectedly has a MASQUERADE rule for $ctl_ip" fi - # Stop + start the --nat VM to exercise the install-is-idempotent - # path (capability runs again on each start; a buggy add-without- - # check would leave two identical rules behind). "$BANGER" vm stop smoke-nat >/dev/null || die 'NAT: stop --nat VM failed' "$BANGER" vm start smoke-nat >/dev/null || die 'NAT: restart --nat VM failed' postrouting="$(sudo -n iptables -t nat -S POSTROUTING 2>/dev/null || true)" @@ -534,7 +432,6 @@ else [[ "$rule_count" == "1" ]] \ || die "NAT: MASQUERADE rule count for $nat_ip = $rule_count after restart, want 1" - # Delete must tear the rule down — regression guard against leaks. "$BANGER" vm delete smoke-nat >/dev/null || die 'NAT: delete --nat VM failed' "$BANGER" vm delete smoke-nocnat >/dev/null || die 'NAT: delete control VM failed' postrouting="$(sudo -n iptables -t nat -S POSTROUTING 2>/dev/null || true)" @@ -542,15 +439,8 @@ else die "NAT: delete left a MASQUERADE rule behind for $nat_ip" fi fi -# shellcheck disable=SC2064 -trap "rm -rf '$runtime_dir'" EXIT # --- invalid spec rejection + no artifact leak ------------------------ -# Tests the negative-path create flow: a blatantly invalid VM spec -# must fail before any VM row is persisted. 
The review cycle flagged -# "cleanup on partial failure" as under-tested; this scenario pins -# that a rejected create doesn't leak a reservation we then have to -# clean up by hand. log 'invalid spec rejection: --vcpu 0 must fail and leave no VM behind' pre_vms="$("$BANGER" vm list --all 2>/dev/null | wc -l)" set +e @@ -562,13 +452,6 @@ post_vms="$("$BANGER" vm list --all 2>/dev/null | wc -l)" [[ "$pre_vms" == "$post_vms" ]] || die "invalid spec leaked a VM row: pre=$pre_vms, post=$post_vms" # --- invalid name rejection ------------------------------------------ -# VM names become DNS labels, guest hostnames, kernel-cmdline tokens -# and file-path fragments — the validator (ValidateVMName) must reject -# anything that isn't [a-z0-9-] with no leading/trailing hyphen and no -# dots. Smoke covers a few of the worst offenders end-to-end through -# the CLI; the full character-class matrix lives in -# internal/model/vm_name_test.go. Rejected names must also leave no -# VM row behind. log 'invalid name rejection: uppercase / space / dot / leading-hyphen must all fail' pre_vms="$("$BANGER" vm list --all 2>/dev/null | wc -l)" for bad in 'MyBox' 'my box' 'box.vm' '-box'; do @@ -582,11 +465,4 @@ post_vms="$("$BANGER" vm list --all 2>/dev/null | wc -l)" [[ "$pre_vms" == "$post_vms" ]] \ || die "invalid name leaked VM row(s): pre=$pre_vms, post=$post_vms" -# --- daemon stop (flushes coverage) ----------------------------------- -log 'stopping daemon so instrumented binaries flush coverage' -"$BANGER" daemon stop >/dev/null 2>&1 || true -# Give the daemon a moment to write its covdata pod before the trap -# tears down runtime_dir. -sleep 0.5 - log 'all scenarios passed'