banger/internal/cli/commands_system.go
Thales Maciel 2606bfbabb
update: VMs survive banger update and rollback
Three load-bearing fixes that together let `banger update` (and its
auto-rollback path) restart the helper + daemon without killing
every running VM. New smoke scenarios prove the property end-to-end.

Bug fixes:

1. Disable the firecracker SDK's signal-forwarding goroutine. The
   default ForwardSignals = [SIGINT, SIGQUIT, SIGTERM, SIGHUP,
   SIGABRT] installs a handler in the helper that propagates the
   helper's SIGTERM (sent by systemd on `systemctl stop bangerd-
   root.service`) to every running firecracker child. Set
   ForwardSignals to an empty (non-nil) slice so setupSignals
   short-circuits at len()==0.

2. Add SendSIGKILL=no to bangerd-root.service. KillMode=process
   limits the initial SIGTERM to the helper main, but systemd
   still SIGKILLs leftover cgroup processes during the
   FinalKillSignal stage unless SendSIGKILL=no.

3. Route restart-helper / restart-daemon / wait-daemon-ready
   failures through rollbackAndRestart instead of rollbackAndWrap.
   rollbackAndWrap restored .previous binaries but didn't re-
   restart the failed unit, leaving the helper dead with the
   rolled-back binary on disk after a failed update.

Testing infrastructure (production binaries unaffected):

- Hidden --manifest-url and --pubkey-file flags on `banger update`
  let the smoke harness redirect the updater at locally-built
  release artefacts. Marked Hidden in cobra; not advertised in
  --help.
- FetchManifestFrom / VerifyBlobSignatureWithKey /
  FetchAndVerifySignatureWithKey export the existing logic against
  caller-supplied URL / pubkey. The default entry points still
  call them with the embedded canonical values.

Smoke scenarios:

- update_check: --check against fake manifest reports update
  available
- update_to_unknown: --to v9.9.9 fails before any host mutation
- update_no_root: refuses without sudo, install untouched
- update_dry_run: stages + verifies, no swap, version unchanged
- update_keeps_vm_alive: real swap to v0.smoke.0; same VM (same
  boot_id) answers SSH after the daemon restart
- update_rollback_keeps_vm_alive: v0.smoke.broken-bangerd ships a
  bangerd that passes --check-migrations but exits 1 as the
  daemon. The post-swap `systemctl restart bangerd` fails,
  rollbackAndRestart fires, the .previous binaries are restored
  and re-restarted; the same VM still answers SSH afterwards
- daemon_admin (separate prep): covers `banger daemon socket`,
  `bangerd --check-migrations --system`, `sudo banger daemon
  stop`

The smoke release builder generates a fresh ECDSA P-256 keypair
with openssl, signs SHA256SUMS cosign-compatibly, and serves
artefacts from a backgrounded python http.server.
verify_smoke_check_test.go pins the openssl/cosign signature
equivalence so the smoke release builder can't silently drift.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 12:08:08 -03:00

485 lines
16 KiB
Go

package cli
import (
"context"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"strconv"
"strings"
"text/tabwriter"
"banger/internal/buildinfo"
"banger/internal/installmeta"
"banger/internal/model"
"banger/internal/paths"
"banger/internal/system"
"github.com/spf13/cobra"
)
const (
systemBangerBin = "/usr/local/bin/banger"
systemBangerdBin = "/usr/local/bin/bangerd"
systemCompanionDir = "/usr/local/lib/banger"
systemCompanionAgent = systemCompanionDir + "/banger-vsock-agent"
systemdUserUnitPath = "/etc/systemd/system/" + installmeta.DefaultService
systemdRootUnitPath = "/etc/systemd/system/" + installmeta.DefaultRootHelperService
systemCoverDirEnv = "BANGER_SYSTEM_GOCOVERDIR"
rootCoverDirEnv = "BANGER_ROOT_HELPER_GOCOVERDIR"
)
func (d *deps) newSystemCommand() *cobra.Command {
var owner string
var purge bool
cmd := &cobra.Command{
Use: "system",
Short: "Install banger's owner-daemon and root-helper systemd units",
Long: strings.TrimSpace(`
Banger ships as two services: an owner-user daemon for
orchestration and a narrow root helper for bridge/tap, NAT, and
Firecracker launch. 'banger system' installs, restarts, inspects,
and removes them.
First-run flow (must be run as root):
sudo banger system install --owner $USER install both services
banger system status confirm they're up
banger doctor check host readiness
After 'install', the owner user can run 'banger ...' day to day
without sudo. Subsequent invocations:
sudo banger system restart bounce both services
sudo banger system uninstall remove services + binaries
sudo banger system uninstall --purge also delete /var/lib/banger
See docs/privileges.md for the full trust model.
`),
Example: strings.TrimSpace(`
sudo banger system install --owner alice
banger system status
sudo banger system uninstall --purge
`),
RunE: helpNoArgs,
}
installCmd := &cobra.Command{
Use: "install",
Short: "Install or refresh the owner daemon and root helper",
Args: noArgsUsage("usage: banger system install [--owner USER]"),
RunE: func(cmd *cobra.Command, args []string) error {
return d.runSystemInstall(cmd.Context(), cmd.OutOrStdout(), owner)
},
}
installCmd.Flags().StringVar(&owner, "owner", "", "login user who will operate banger day-to-day")
statusCmd := &cobra.Command{
Use: "status",
Short: "Show owner-daemon and root-helper status",
Args: noArgsUsage("usage: banger system status"),
RunE: func(cmd *cobra.Command, args []string) error {
return d.runSystemStatus(cmd.Context(), cmd.OutOrStdout())
},
}
restartCmd := &cobra.Command{
Use: "restart",
Short: "Restart the installed banger services",
Args: noArgsUsage("usage: banger system restart"),
RunE: func(cmd *cobra.Command, args []string) error {
if err := requireRoot(); err != nil {
return err
}
if err := d.runSystemctl(cmd.Context(), "restart", installmeta.DefaultRootHelperService); err != nil {
return err
}
if err := d.runSystemctl(cmd.Context(), "restart", installmeta.DefaultService); err != nil {
return err
}
if err := d.waitForDaemonReady(cmd.Context(), paths.ResolveSystem().SocketPath); err != nil {
return err
}
_, err := fmt.Fprintln(cmd.OutOrStdout(), "restarted")
return err
},
}
uninstallCmd := &cobra.Command{
Use: "uninstall",
Short: "Remove the installed banger services",
Args: noArgsUsage("usage: banger system uninstall [--purge]"),
RunE: func(cmd *cobra.Command, args []string) error {
return d.runSystemUninstall(cmd.Context(), cmd.OutOrStdout(), purge)
},
}
uninstallCmd.Flags().BoolVar(&purge, "purge", false, "also delete system-owned banger state and cache")
cmd.AddCommand(installCmd, statusCmd, restartCmd, uninstallCmd)
return cmd
}
func (d *deps) runSystemInstall(ctx context.Context, out io.Writer, ownerFlag string) error {
if err := requireRoot(); err != nil {
return err
}
meta, err := resolveInstallOwner(ownerFlag)
if err != nil {
return err
}
info := buildinfo.Current()
meta.Version = info.Version
meta.Commit = info.Commit
meta.BuiltAt = info.BuiltAt
meta.InstalledAt = model.Now()
bangerBin, err := paths.BangerPath()
if err != nil {
return err
}
bangerdBin, err := paths.BangerdPath()
if err != nil {
return err
}
agentBin, err := paths.CompanionBinaryPath("banger-vsock-agent")
if err != nil {
return err
}
if err := os.MkdirAll(filepath.Dir(systemBangerBin), 0o755); err != nil {
return err
}
if err := os.MkdirAll(systemCompanionDir, 0o755); err != nil {
return err
}
if err := installFile(bangerBin, systemBangerBin, 0o755); err != nil {
return err
}
if err := installFile(bangerdBin, systemBangerdBin, 0o755); err != nil {
return err
}
if err := installFile(agentBin, systemCompanionAgent, 0o755); err != nil {
return err
}
if err := installmeta.Save(installmeta.DefaultPath, meta); err != nil {
return err
}
if err := paths.EnsureSystem(paths.ResolveSystem()); err != nil {
return err
}
if err := os.WriteFile(systemdRootUnitPath, []byte(renderRootHelperSystemdUnit()), 0o644); err != nil {
return err
}
if err := os.WriteFile(systemdUserUnitPath, []byte(renderSystemdUnit(meta)), 0o644); err != nil {
return err
}
if err := d.runSystemctl(ctx, "daemon-reload"); err != nil {
return err
}
if err := d.runSystemctl(ctx, "enable", installmeta.DefaultRootHelperService); err != nil {
return err
}
if err := d.runSystemctl(ctx, "enable", installmeta.DefaultService); err != nil {
return err
}
if err := d.runSystemctl(ctx, "restart", installmeta.DefaultRootHelperService); err != nil {
return err
}
if err := d.runSystemctl(ctx, "restart", installmeta.DefaultService); err != nil {
return err
}
if err := d.waitForDaemonReady(ctx, installmeta.DefaultSocketPath); err != nil {
return err
}
if _, err := fmt.Fprintln(out, "installed"); err != nil {
return err
}
w := tabwriter.NewWriter(out, 0, 8, 2, ' ', 0)
fmt.Fprintf(w, "owner\t%s\n", meta.OwnerUser)
fmt.Fprintf(w, "socket\t%s\n", installmeta.DefaultSocketPath)
fmt.Fprintf(w, "helper_socket\t%s\n", installmeta.DefaultRootHelperSocketPath)
fmt.Fprintf(w, "service\t%s\n", installmeta.DefaultService)
fmt.Fprintf(w, "helper_service\t%s\n", installmeta.DefaultRootHelperService)
return w.Flush()
}
func (d *deps) runSystemStatus(ctx context.Context, out io.Writer) error {
layout := paths.ResolveSystem()
active := d.systemctlQuery(ctx, "is-active", installmeta.DefaultService)
if active == "" {
active = "unknown"
}
enabled := d.systemctlQuery(ctx, "is-enabled", installmeta.DefaultService)
if enabled == "" {
enabled = "unknown"
}
helperActive := d.systemctlQuery(ctx, "is-active", installmeta.DefaultRootHelperService)
if helperActive == "" {
helperActive = "unknown"
}
helperEnabled := d.systemctlQuery(ctx, "is-enabled", installmeta.DefaultRootHelperService)
if helperEnabled == "" {
helperEnabled = "unknown"
}
w := tabwriter.NewWriter(out, 0, 8, 2, ' ', 0)
fmt.Fprintf(w, "service\t%s\n", installmeta.DefaultService)
fmt.Fprintf(w, "enabled\t%s\n", enabled)
fmt.Fprintf(w, "active\t%s\n", active)
fmt.Fprintf(w, "helper_service\t%s\n", installmeta.DefaultRootHelperService)
fmt.Fprintf(w, "helper_enabled\t%s\n", helperEnabled)
fmt.Fprintf(w, "helper_active\t%s\n", helperActive)
fmt.Fprintf(w, "socket\t%s\n", layout.SocketPath)
fmt.Fprintf(w, "helper_socket\t%s\n", installmeta.DefaultRootHelperSocketPath)
fmt.Fprintf(w, "log\tjournalctl -u %s -u %s\n", installmeta.DefaultService, installmeta.DefaultRootHelperService)
if ping, err := d.daemonPing(ctx, layout.SocketPath); err == nil {
info := buildinfo.Normalize(ping.Version, ping.Commit, ping.BuiltAt)
fmt.Fprintf(w, "pid\t%d\n", ping.PID)
fmt.Fprintf(w, "version\t%s\n", info.Version)
if info.Commit != "" {
fmt.Fprintf(w, "commit\t%s\n", info.Commit)
}
if info.BuiltAt != "" {
fmt.Fprintf(w, "built_at\t%s\n", info.BuiltAt)
}
}
return w.Flush()
}
func (d *deps) runSystemUninstall(ctx context.Context, out io.Writer, purge bool) error {
if err := requireRoot(); err != nil {
return err
}
_ = d.runSystemctl(ctx, "disable", "--now", installmeta.DefaultService, installmeta.DefaultRootHelperService)
_ = os.Remove(systemdUserUnitPath)
_ = os.Remove(systemdRootUnitPath)
_ = os.Remove(installmeta.DefaultPath)
_ = os.Remove(installmeta.DefaultDir)
_ = d.runSystemctl(ctx, "daemon-reload")
_ = os.Remove(systemBangerdBin)
_ = os.Remove(systemBangerBin)
_ = os.RemoveAll(systemCompanionDir)
if purge {
_ = os.RemoveAll(paths.ResolveSystem().StateDir)
_ = os.RemoveAll(paths.ResolveSystem().CacheDir)
_ = os.RemoveAll(paths.ResolveSystem().RuntimeDir)
}
msg := "uninstalled"
if purge {
msg += " (purged state)"
}
_, err := fmt.Fprintln(out, msg)
return err
}
func resolveInstallOwner(ownerFlag string) (installmeta.Metadata, error) {
owner := strings.TrimSpace(ownerFlag)
if owner == "" {
owner = strings.TrimSpace(os.Getenv("SUDO_USER"))
}
if owner == "" {
return installmeta.Metadata{}, errors.New("owner is required; pass --owner USER when installing without sudo")
}
if owner == "root" {
return installmeta.Metadata{}, errors.New("refusing to install with root as the banger owner")
}
return installmeta.LookupOwner(owner)
}
func renderSystemdUnit(meta installmeta.Metadata) string {
lines := []string{
"[Unit]",
"Description=banger daemon",
"After=network-online.target",
"Wants=network-online.target " + installmeta.DefaultRootHelperService,
"After=" + installmeta.DefaultRootHelperService,
"Requires=" + installmeta.DefaultRootHelperService,
"",
"[Service]",
"Type=simple",
"User=" + meta.OwnerUser,
"ExecStart=" + systemBangerdBin + " --system",
"Restart=on-failure",
"RestartSec=1s",
// KillMode=process: only signal the main PID on stop/restart.
// The default (control-group) sends SIGKILL to every process in
// the unit's cgroup, including descendants — and during `banger
// update` we restart this unit, which would terminate any
// in-flight subprocesses spawned by the daemon. The daemon
// shuts its own children down explicitly when needed.
"KillMode=process",
"Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"Environment=TMPDIR=/run/banger",
"UMask=0077",
"NoNewPrivileges=yes",
"PrivateMounts=yes",
"ProtectSystem=strict",
"ProtectHome=read-only",
"ProtectControlGroups=yes",
"ProtectKernelLogs=yes",
"ProtectKernelModules=yes",
"ProtectClock=yes",
"ProtectHostname=yes",
"RestrictSUIDSGID=yes",
"LockPersonality=yes",
"SystemCallArchitectures=native",
"RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 AF_NETLINK AF_VSOCK",
"StateDirectory=banger",
"StateDirectoryMode=0700",
"CacheDirectory=banger",
"CacheDirectoryMode=0700",
"RuntimeDirectory=banger",
"RuntimeDirectoryMode=0700",
// Keep /run/banger across stop/restart so the api-sock symlinks
// the helper creates for live VMs aren't wiped between the daemon
// stopping and the new daemon's reconcile re-attaching to them.
// Without this, `banger update` restarts the daemon, /run/banger
// is wiped, the api-sock symlinks vanish, and rediscoverHandles
// can't resolve the chroot path it needs to read jailer's pidfile.
"RuntimeDirectoryPreserve=yes",
}
if coverDir := strings.TrimSpace(os.Getenv(systemCoverDirEnv)); coverDir != "" {
lines = append(lines, "Environment=GOCOVERDIR="+systemdQuote(coverDir))
}
if home := strings.TrimSpace(meta.OwnerHome); home != "" {
lines = append(lines, "ReadOnlyPaths="+systemdQuote(home))
}
lines = append(lines,
"",
"[Install]",
"WantedBy=multi-user.target",
"",
)
return strings.Join(lines, "\n")
}
func renderRootHelperSystemdUnit() string {
lines := []string{
"[Unit]",
"Description=banger root helper",
"After=network-online.target",
"Wants=network-online.target",
"",
"[Service]",
"Type=simple",
"ExecStart=" + systemBangerdBin + " --root-helper",
"Restart=on-failure",
"RestartSec=1s",
// KillMode=process + SendSIGKILL=no together make the helper
// safe to restart while banger-launched firecrackers are
// running. firecracker lives in this unit's cgroup (jailer
// doesn't open a sub-cgroup), so:
//
// - Default control-group mode SIGKILLs every process in
// the cgroup on stop.
// - KillMode=process limits the initial SIGTERM to the
// helper main PID; systemd leaves remaining cgroup
// processes alone (and logs "Unit process N (firecracker)
// remains running after unit stopped").
// - SendSIGKILL=no disables the FinalKillSignal escalation
// that would otherwise SIGKILL leftovers after the timeout.
//
// One more pitfall: the firecracker SDK installs a default
// signal-forwarding goroutine in the helper that catches
// SIGTERM (etc.) and forwards it to every firecracker child.
// We disable that explicitly via ForwardSignals: []os.Signal{}
// in firecracker.buildConfig — without that override, systemd
// signaling the helper main would propagate to every running
// VM regardless of what these directives do.
//
// `banger system uninstall` and the daemon's vm-stop path
// explicitly stop firecracker processes when actually needed,
// so we don't lose the systemd-driven kill as a real safety
// net — banger drives those kills itself.
"KillMode=process",
"SendSIGKILL=no",
"Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"Environment=TMPDIR=" + installmeta.DefaultRootHelperRuntimeDir,
"UMask=0077",
"NoNewPrivileges=yes",
"PrivateTmp=yes",
"PrivateMounts=yes",
"ProtectSystem=strict",
"ProtectHome=yes",
"ProtectControlGroups=yes",
"ProtectKernelLogs=yes",
"ProtectKernelModules=yes",
"ProtectClock=yes",
"ProtectHostname=yes",
"RestrictSUIDSGID=yes",
"LockPersonality=yes",
"SystemCallArchitectures=native",
"RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 AF_NETLINK AF_VSOCK",
"CapabilityBoundingSet=CAP_CHOWN CAP_DAC_OVERRIDE CAP_FOWNER CAP_KILL CAP_MKNOD CAP_NET_ADMIN CAP_NET_RAW CAP_SETGID CAP_SETUID CAP_SYS_ADMIN CAP_SYS_CHROOT",
"ReadWritePaths=/var/lib/banger",
"RuntimeDirectory=banger-root",
"RuntimeDirectoryMode=0711",
// Same rationale as bangerd.service: the helper-managed
// /run/banger-root holds the helper's RPC socket and any
// per-VM scratch state; preserving it across restart keeps
// the daemon's reconnect path and reconcile re-attachment
// from racing against systemd's runtime-dir cleanup.
"RuntimeDirectoryPreserve=yes",
}
if coverDir := strings.TrimSpace(os.Getenv(rootCoverDirEnv)); coverDir != "" {
lines = append(lines, "Environment=GOCOVERDIR="+systemdQuote(coverDir))
}
lines = append(lines,
"",
"[Install]",
"WantedBy=multi-user.target",
"",
)
return strings.Join(lines, "\n")
}
func systemdQuote(value string) string {
return strconv.Quote(strings.TrimSpace(value))
}
func installFile(sourcePath, targetPath string, mode os.FileMode) error {
if err := os.MkdirAll(filepath.Dir(targetPath), 0o755); err != nil {
return err
}
tempPath := targetPath + ".tmp"
_ = os.Remove(tempPath)
if err := system.CopyFilePreferClone(sourcePath, tempPath); err != nil {
return err
}
if err := os.Chmod(tempPath, mode); err != nil {
_ = os.Remove(tempPath)
return err
}
if err := os.Rename(tempPath, targetPath); err != nil {
_ = os.Remove(tempPath)
return err
}
return nil
}
func requireRoot() error {
if os.Geteuid() == 0 {
return nil
}
return errors.New("this command requires root; run it with sudo")
}
func (d *deps) runSystemctl(ctx context.Context, args ...string) error {
_, err := d.hostCommandOutput(ctx, "systemctl", args...)
return err
}
func (d *deps) systemctlQuery(ctx context.Context, args ...string) string {
output, err := d.hostCommandOutput(ctx, "systemctl", args...)
if err == nil {
return strings.TrimSpace(string(output))
}
msg := strings.TrimSpace(string(output))
if msg != "" {
return msg
}
msg = strings.TrimSpace(err.Error())
if idx := strings.LastIndex(msg, ": "); idx >= 0 {
return strings.TrimSpace(msg[idx+2:])
}
return msg
}