daemon: split owner daemon from root helper

Move the supported systemd path to two services: an owner-user bangerd for orchestration and a narrow root helper for bridge/tap, NAT/resolver, dm/loop, and Firecracker ownership. This removes repeated sudo from daily vm and image flows without leaving the general daemon running as root. Add install metadata, system install/status/restart/uninstall commands, and a system-owned runtime layout. Keep user SSH/config material in the owner home, lock file_sync to the owner home, and move daemon known_hosts handling out of the old root-owned control path. Route privileged lifecycle steps through typed privilegedOps calls, harden the two systemd units, and rewrite smoke plus docs around the supported service model. Verified with make build, make test, make lint, and make smoke on the supported systemd host path.
2026-04-26 12:43:17 -03:00 · 2026-04-26 12:43:17 -03:00 · 59e48e830b
commit 59e48e830b
parent 3edd7c6de7
53 changed files with 3239 additions and 726 deletions
--- a/scripts/smoke.sh
+++ b/scripts/smoke.sh
@ -1,34 +1,27 @@
 #!/usr/bin/env bash
 #
-# scripts/smoke.sh — end-to-end smoke suite for banger.
+# scripts/smoke.sh — end-to-end smoke suite for banger's supported
+# two-service systemd model.
 #
-# Drives a real create → start → ssh → exec → delete cycle against
-# real Firecracker + real KVM on the host. Intended as a pre-release
-# gate: the Go unit + integration tests don't and can't cover the
-# post-machine.Start path (socket ownership, guest boot, vsock agent
-# wait, guest SSH, workspace prepare). If this suite fails, don't
-# ship.
+# Installs instrumented binaries as temporary bangerd.service +
+# bangerd-root.service, drives real Firecracker/KVM scenarios, collects
+# covdata from both services plus the CLI, then purges the smoke-owned
+# install on exit.
 #
-# State lives under $BANGER_SMOKE_XDG_DIR (set by `make smoke`,
-# defaults to build/smoke/xdg). It's ISOLATED from the invoking
-# user's real banger install via XDG_{CONFIG,STATE,CACHE,RUNTIME}
-# overrides, but PERSISTED across runs — so the first smoke pulls
-# the golden image, subsequent smokes reuse it. `make smoke-clean`
-# wipes it.
+# Because the supported path is global host state, smoke refuses to
+# overwrite a pre-existing non-smoke install. If a prior smoke crashed,
+# rerun `make smoke-clean` or `make smoke`; the smoke marker lets the
+# harness purge only its own stale install safely.
 #
-# Invoked via `make smoke`, which sets the three env vars below.
-# Don't run this directly unless you know they're set.
+# Scratch files live under $BANGER_SMOKE_XDG_DIR (historic name kept for
+# make-compat). Service state uses the real supported system paths and is
+# purged by the smoke cleanup path.

 set -euo pipefail

 log() { printf '[smoke] %s\n' "$*" >&2; }
 die() { printf '[smoke] FAIL: %s\n' "$*" >&2; exit 1; }

-# wait_for_ssh polls `vm ssh <vm> -- true` until it succeeds or the
-# timeout expires. `vm ssh` — unlike `vm run` — does not itself wait
-# for guest sshd, so scenarios that call `vm create` / `vm start`
-# back-to-back with `vm ssh` need this shim. 60s matches
-# vmRunSSHTimeout.
 wait_for_ssh() {
  local vm="$1"
  local deadline=$(( $(date +%s) + 60 ))
@ -43,7 +36,7 @@ wait_for_ssh() {

 : "${BANGER_SMOKE_BIN_DIR:?must point at the instrumented binary dir, set by make smoke}"
 : "${BANGER_SMOKE_COVER_DIR:?must point at the coverage dir, set by make smoke}"
-: "${BANGER_SMOKE_XDG_DIR:?must point at the isolated XDG root, set by make smoke}"
+: "${BANGER_SMOKE_XDG_DIR:?must point at the smoke scratch root, set by make smoke}"

 BANGER="$BANGER_SMOKE_BIN_DIR/banger"
 BANGERD="$BANGER_SMOKE_BIN_DIR/bangerd"
@ -53,53 +46,108 @@ for bin in "$BANGER" "$BANGERD" "$VSOCK_AGENT"; do
  [[ -x "$bin" ]] || die "binary missing or not executable: $bin"
 done

-# Persistent XDG dirs (state, cache, config) so repeated smoke
-# runs reuse the pulled golden image instead of re-downloading
-# ~300MB each time. Runtime dir needs to be fresh per-run because
-# it holds sockets the daemon cleans up on stop and refuses to
-# reuse if any are stale.
-mkdir -p \
-  "$BANGER_SMOKE_XDG_DIR/config" \
-  "$BANGER_SMOKE_XDG_DIR/state" \
-  "$BANGER_SMOKE_XDG_DIR/cache"
-runtime_dir="$(mktemp -d -t banger-smoke-runtime-XXXXXX)"
-# shellcheck disable=SC2064
-trap "rm -rf '$runtime_dir'" EXIT
-chmod 0700 "$runtime_dir"
+scratch_root="$BANGER_SMOKE_XDG_DIR"
+runtime_dir=
+smoke_owner="$(id -un)"
+smoke_marker='/etc/banger/.smoke-owned'
+service_cover_dir='/var/lib/banger'
+owner_service='bangerd.service'
+root_service='bangerd-root.service'

-export XDG_CONFIG_HOME="$BANGER_SMOKE_XDG_DIR/config"
-export XDG_STATE_HOME="$BANGER_SMOKE_XDG_DIR/state"
-export XDG_CACHE_HOME="$BANGER_SMOKE_XDG_DIR/cache"
-export XDG_RUNTIME_DIR="$runtime_dir"
+mkdir -p "$BANGER_SMOKE_COVER_DIR"
+rm -rf "$scratch_root"
+mkdir -p "$scratch_root"
+runtime_dir="$(mktemp -d "$scratch_root/runtime-XXXXXX")"

-# Point banger at its companion binaries inside the smoke build.
-export BANGER_DAEMON_BIN="$BANGERD"
-export BANGER_VSOCK_AGENT_BIN="$VSOCK_AGENT"
-
-# Instrumented binaries dump coverage here on clean exit.
+# The CLI binary itself is instrumented, so keep its covdata local.
 export GOCOVERDIR="$BANGER_SMOKE_COVER_DIR"
-mkdir -p "$GOCOVERDIR"

-# Any smoke daemon left behind from a prior run that crashed mid-
-# scenario would reuse the stale socket path and confuse
-# ensureDaemon. Best-effort stop; ignore if nothing is running.
-"$BANGER" daemon stop >/dev/null 2>&1 || true
+cleanup_export_vm() {
+  "$BANGER" vm delete smoke-export >/dev/null 2>&1 || true
+}

-# banger's vmDNS binds 127.0.0.1:42069 (UDP) hard. If the user's
-# real (non-smoke) daemon is running, its listener holds the port
-# and the smoke daemon's Open() fails before any scenario runs.
-# Fail fast with an actionable message — don't guess whether to
-# stop the user's daemon for them.
-if command -v ss >/dev/null 2>&1 && ss -Huln 2>/dev/null | awk '{print $4}' | grep -q '[:.]42069$'; then
-  die 'port 127.0.0.1:42069 is already bound (likely your real banger daemon); stop it with `banger daemon stop` and re-run `make smoke`'
+cleanup_prune() {
+  "$BANGER" vm delete smoke-prune-running >/dev/null 2>&1 || true
+  "$BANGER" vm delete smoke-prune-stopped >/dev/null 2>&1 || true
+}
+
+# collect_service_coverage copies the covmeta.* / covcounters.* files found
+# in $service_cover_dir into $BANGER_SMOKE_COVER_DIR, then chowns each copy
+# to the invoking user's uid:gid with mode 0644. The copy runs under sudo
+# (the source is a system path). With nullglob set, a source directory
+# holding no coverage files makes the loop a no-op instead of an error.
+collect_service_coverage() {
+  local uid gid
+  uid="$(id -u)"
+  gid="$(id -g)"
+  # Plain `bash -c`, not a login shell: root's profile scripts have no
+  # business running here and could print, fail, or change directory
+  # before the copy (which matters if the destination path is relative).
+  sudo bash -c '
+    set -euo pipefail
+    shopt -s nullglob
+    dst="$1"
+    uid="$2"
+    gid="$3"
+    src="$4"
+    for file in "$src"/covmeta.* "$src"/covcounters.*; do
+      base="${file##*/}"
+      cp -- "$file" "$dst/$base"
+      chown -- "$uid:$gid" "$dst/$base"
+      chmod 0644 -- "$dst/$base"
+    done
+  ' bash "$BANGER_SMOKE_COVER_DIR" "$uid" "$gid" "$service_cover_dir"
+}
+
+stop_services_for_coverage() {
+  sudo systemctl stop "$owner_service" "$root_service" >/dev/null 2>&1 || true
+}
+
+sudo_banger() {
+  sudo env GOCOVERDIR="$BANGER_SMOKE_COVER_DIR" "$@"
+}
+
+cleanup() {
+  set +e
+  for vm in \
+    smoke-lifecycle smoke-set smoke-restart smoke-kill smoke-ports smoke-fc \
+    smoke-basecommit smoke-nat smoke-nocnat; do
+    "$BANGER" vm delete "$vm" >/dev/null 2>&1 || true
+  done
+  cleanup_export_vm
+  cleanup_prune
+  stop_services_for_coverage
+  collect_service_coverage
+  sudo_banger "$BANGER" system uninstall --purge >/dev/null 2>&1 || true
+  rm -rf "$scratch_root"
+}
+# Check for a pre-existing install BEFORE arming the EXIT trap. cleanup
+# stops both services and runs `system uninstall --purge`; if the trap
+# were already armed when we die on a non-smoke install, the exit path
+# would tear down the very install this check refuses to touch.
+if sudo test -f /etc/banger/install.toml; then
+  if sudo test -f "$smoke_marker"; then
+    log 'found stale smoke-owned install; purging it first'
+    sudo_banger "$BANGER" system uninstall --purge >/dev/null 2>&1 || true
+  else
+    # The trap is not armed yet, so drop the scratch dir by hand.
+    rm -rf -- "$scratch_root"
+    die 'banger is already installed on this host; supported-path smoke refuses to overwrite a non-smoke install'
+  fi
 fi
+
+# Past this point any install on the host is smoke-owned (either the stale
+# one handled above or the one installed below), so cleanup may purge it.
+trap cleanup EXIT

-# --- doctor -----------------------------------------------------------
+log 'installing smoke-owned services'
+sudo env \
+  GOCOVERDIR="$BANGER_SMOKE_COVER_DIR" \
+  BANGER_SYSTEM_GOCOVERDIR="$service_cover_dir" \
+  BANGER_ROOT_HELPER_GOCOVERDIR="$service_cover_dir" \
+  "$BANGER" system install --owner "$smoke_owner" >/dev/null \
+  || die 'system install failed'
+sudo touch "$smoke_marker"
+
+status_out="$("$BANGER" system status)" || die 'system status failed after install'
+# Anchor the owner check: an unanchored 'active: active' is a substring of
+# 'helper_active: active', so it would pass on the helper's line alone and
+# never notice an inactive owner daemon.
+grep -Eq '(^|[^_[:alnum:]])active: active' <<<"$status_out" || die "owner daemon not active after install: $status_out"
+grep -q 'helper_active: active' <<<"$status_out" || die "root helper not active after install: $status_out"
+
 log 'doctor: checking host readiness'
 if ! "$BANGER" doctor; then
  die 'doctor reported failures; fix the host before running smoke'
 fi

+log 'system restart: services should come back cleanly'
+sudo_banger "$BANGER" system restart >/dev/null || die 'system restart failed'
+status_out="$("$BANGER" system status)" || die 'system status failed after restart'
+# Anchor the owner check: an unanchored 'active: active' is a substring of
+# 'helper_active: active', so it would pass on the helper's line alone and
+# never notice an owner daemon that failed to come back.
+grep -Eq '(^|[^_[:alnum:]])active: active' <<<"$status_out" || die "owner daemon not active after restart: $status_out"
+grep -q 'helper_active: active' <<<"$status_out" || die "root helper not active after restart: $status_out"
+
 # --- bare vm run ------------------------------------------------------
 log "bare vm run: create + start + ssh + exec 'echo smoke-bare-ok' + --rm"
 bare_out="$("$BANGER" vm run --rm -- echo smoke-bare-ok)" || die "bare vm run exit $?"
@ -125,11 +173,6 @@ ws_out="$("$BANGER" vm run --rm "$repodir" -- cat /root/repo/smoke-file.txt)" ||
 grep -q 'smoke-workspace-marker' <<<"$ws_out" || die "workspace vm run didn't ship smoke-file.txt: $ws_out"

 # --- command exit-code propagation ------------------------------------
-# A non-zero exit from the guest command must surface as banger's own
-# exit code. Regressions here are hard to catch any other way — the
-# local Go tests don't cross the SSH boundary, and users expect their
-# CI scripts that wrap `banger vm run` to fail when the thing inside
-# the VM failed.
 log 'exit-code propagation: guest `sh -c "exit 42"` must produce rc=42'
 set +e
 "$BANGER" vm run --rm -- sh -c 'exit 42'
@ -138,66 +181,35 @@ set -e
 [[ "$rc" -eq 42 ]] || die "exit-code propagation: got rc=$rc, want 42"

 # --- workspace dry-run (no VM) ----------------------------------------
-# Pure CLI-side path — no VM, no sudo, just the local git inspection
-# against d.repoInspector. Fast; catches regressions in the preview
-# output (file list shape, mode line) that the Go tests already pin
-# but that could still be broken by a client-side wiring change.
 log 'workspace dry-run: list tracked files without creating a VM'
 dry_out="$("$BANGER" vm run --dry-run "$repodir")" || die "dry-run exit $?"
 grep -q 'smoke-file.txt' <<<"$dry_out" || die "dry-run didn't list smoke-file.txt: $dry_out"
 grep -q 'mode: tracked only' <<<"$dry_out" || die "dry-run mode line missing or wrong: $dry_out"

 # --- workspace --include-untracked -----------------------------------
-# The default is tracked-only (review cycle 4). Opt-in must ship
-# untracked files too. Write one, run with --include-untracked, verify
-# it reaches the guest.
 log 'workspace --include-untracked: opt-in ships files outside the git index'
 echo 'untracked-marker' > "$repodir/smoke-untracked.txt"
 inc_out="$("$BANGER" vm run --rm --include-untracked "$repodir" -- cat /root/repo/smoke-untracked.txt)" || die "include-untracked vm run exit $?"
 grep -q 'untracked-marker' <<<"$inc_out" || die "--include-untracked didn't ship the untracked file: $inc_out"
-# Restore repo to tracked-only state for any later scenarios.
 rm -f "$repodir/smoke-untracked.txt"

 # --- workspace export round-trip --------------------------------------
-# Exercises ExportVMWorkspace: create a VM, prepare the workspace,
-# write a new file inside the guest, then export and assert the
-# emitted patch sees the guest-side change. If the export pipeline
-# (temp-index, git add -A, diff --binary) ever stops capturing
-# guest-side changes, this scenario catches it.
 log 'workspace export: create + prepare + guest edit + export + assert marker'
-export_vm='smoke-export'
-cleanup_export_vm() {
-  "$BANGER" vm delete "$export_vm" >/dev/null 2>&1 || true
-}
-# Chain the VM cleanup with the existing runtime_dir trap so a mid-
-# scenario failure still tears the VM down before the script exits.
-# shellcheck disable=SC2064
-trap "cleanup_export_vm; rm -rf '$runtime_dir'" EXIT
-
-"$BANGER" vm create --name "$export_vm" --image debian-bookworm >/dev/null \
+"$BANGER" vm create --name smoke-export --image debian-bookworm >/dev/null \
  || die "export: vm create exit $?"
-"$BANGER" vm workspace prepare "$export_vm" "$repodir" >/dev/null \
+"$BANGER" vm workspace prepare smoke-export "$repodir" >/dev/null \
  || die "export: workspace prepare exit $?"
-"$BANGER" vm ssh "$export_vm" -- sh -c 'echo guest-edit > /root/repo/new-guest-file.txt' \
+"$BANGER" vm ssh smoke-export -- sh -c 'echo guest-edit > /root/repo/new-guest-file.txt' \
  || die "export: guest-side file write exit $?"
 export_patch="$runtime_dir/smoke-export.diff"
-"$BANGER" vm workspace export "$export_vm" --output "$export_patch" \
+"$BANGER" vm workspace export smoke-export --output "$export_patch" \
  || die "export: workspace export exit $?"
 [[ -s "$export_patch" ]] || die "export: patch file empty at $export_patch"
 grep -q 'new-guest-file.txt' "$export_patch" \
  || die "export: patch missing new-guest-file.txt marker (head: $(head -c 400 "$export_patch"))"
-
 cleanup_export_vm
-# shellcheck disable=SC2064
-trap "rm -rf '$runtime_dir'" EXIT

 # --- concurrent vm runs -----------------------------------------------
-# Stresses per-VM lock scoping, the tap pool warm-up path, and
-# createVMMu's narrow reservation window. Two `vm run --rm` invocations
-# that actually overlap should both succeed. A regression that
-# serialises create path too aggressively would make this slow but
-# still pass; a regression that breaks tap allocation or name
-# uniqueness would fail one of them.
 log 'concurrent vm runs: two --rm invocations must both succeed'
 tmpA="$runtime_dir/concurrent-a.out"
 tmpB="$runtime_dir/concurrent-b.out"
@ -211,18 +223,8 @@ grep -q 'smoke-concurrent-a' "$tmpA" || die "concurrent VM A missing marker: $(c
 grep -q 'smoke-concurrent-b' "$tmpB" || die "concurrent VM B missing marker: $(cat "$tmpB")"

 # --- vm lifecycle (create → stop → start → delete) --------------------
-# Exercises lifecycle verbs directly instead of the --rm convenience
-# path. The critical assertion is the second `vm ssh` AFTER stop/start:
-# that path (a) rebuilds the handle cache via rediscoverHandles,
-# (b) runs the e2fsck-snapshot sanitize step before patchRootOverlay
-# on the dirty COW, and (c) shouldn't die from the SDK's
-# ctx-SIGTERM-on-RPC-close goroutine. All three were bugs at one
-# point; this scenario guards all three at once.
 log 'vm lifecycle: explicit create / stop / start / ssh / delete'
 lifecycle_name=smoke-lifecycle
-# shellcheck disable=SC2064
-trap "\"$BANGER\" vm delete $lifecycle_name >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT
-
 "$BANGER" vm create --name "$lifecycle_name" >/dev/null || die "vm create $lifecycle_name failed"
 show_out="$("$BANGER" vm show "$lifecycle_name")" || die "vm show after create failed"
 grep -q '"state": "running"' <<<"$show_out" || die "post-create state not running: $show_out"
@ -249,18 +251,9 @@ set +e
 rc=$?
 set -e
 [[ "$rc" -ne 0 ]] || die "vm show still finds $lifecycle_name after delete"
-# shellcheck disable=SC2064
-trap "rm -rf '$runtime_dir'" EXIT

 # --- vm set reconfiguration (vcpu change + restart) -------------------
-# Exercises SetVM + configChangeCapability. Create with --vcpu 2,
-# stop, `vm set --vcpu 4`, restart, confirm the guest sees the new
-# count. Regression guard: a restart that reuses the pre-change spec
-# would leave nproc at 2.
 log 'vm set: create --vcpu 2 → stop → set --vcpu 4 → restart → nproc=4'
-# shellcheck disable=SC2064
-trap "\"$BANGER\" vm delete smoke-set >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT
-
 "$BANGER" vm create --name smoke-set --vcpu 2 >/dev/null || die 'vm set: create failed'
 wait_for_ssh smoke-set || die 'vm set: initial ssh did not come up'

@ -286,19 +279,9 @@ set -e
  || die "vm set: post-reconfig nproc got '$nproc_after', want 4 (spec change didn't land)"

 "$BANGER" vm delete smoke-set >/dev/null || die 'vm set: delete failed'
-# shellcheck disable=SC2064
-trap "rm -rf '$runtime_dir'" EXIT

 # --- vm restart (dedicated verb) --------------------------------------
-# `vm restart` is its own verb, not a stop+start composite at the API
-# level — it must end up with a freshly booted guest. The assertion is
-# a fresh boot ID: /proc/sys/kernel/random/boot_id changes on every
-# kernel boot, so post-restart != pre-restart proves the kernel was
-# actually recycled rather than the verb no-op'ing.
 log 'vm restart: boot_id must change across the verb'
-# shellcheck disable=SC2064
-trap "\"$BANGER\" vm delete smoke-restart >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT
-
 "$BANGER" vm create --name smoke-restart >/dev/null || die 'vm restart: create failed'
 wait_for_ssh smoke-restart || die 'vm restart: initial ssh never came up'
 boot_before="$("$BANGER" vm ssh smoke-restart -- cat /proc/sys/kernel/random/boot_id | tr -d '[:space:]')"
@ -312,19 +295,9 @@ boot_after="$("$BANGER" vm ssh smoke-restart -- cat /proc/sys/kernel/random/boot
  || die "vm restart: boot_id unchanged ($boot_before); verb didn't actually reboot the guest"

 "$BANGER" vm delete smoke-restart >/dev/null || die 'vm restart: delete failed'
-# shellcheck disable=SC2064
-trap "rm -rf '$runtime_dir'" EXIT

 # --- vm kill (--signal KILL, forceful path) ---------------------------
-# `vm stop` takes the graceful Ctrl-Alt-Del route. `vm kill --signal
-# KILL` is the explicit "the guest is wedged, drop it" path. It must
-# (a) terminate firecracker, (b) leave the VM record in a stopped
-# state (not 'error'), (c) tear down the dm-snapshot + loops so the
-# next create/start doesn't trip over leftovers.
 log 'vm kill --signal KILL: forceful terminate, state=stopped, no leaked dm device'
-# shellcheck disable=SC2064
-trap "\"$BANGER\" vm delete smoke-kill >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT
-
 "$BANGER" vm create --name smoke-kill >/dev/null || die 'vm kill: create failed'
 dm_name="$("$BANGER" vm show smoke-kill 2>/dev/null | awk -F'"' '/"dm_dev"|fc-rootfs-/ {for(i=1;i<=NF;i++) if($i~/^fc-rootfs-/) print $i}' | head -1 || true)"
 "$BANGER" vm kill --signal KILL smoke-kill >/dev/null || die 'vm kill: verb failed'
@ -336,22 +309,9 @@ if [[ -n "$dm_name" ]]; then
  fi
 fi
 "$BANGER" vm delete smoke-kill >/dev/null || die 'vm kill: delete failed'
-# shellcheck disable=SC2064
-trap "rm -rf '$runtime_dir'" EXIT

 # --- vm prune (-f) ----------------------------------------------------
-# Create two VMs: one running, one stopped. `vm prune -f` must delete
-# the stopped one and leave the running one alone. Skip interactive
-# confirmation with -f (smoke has no tty). Regression guard: a bug
-# that deleted the running VM would wreck any session the user had.
 log 'vm prune -f: removes stopped VMs, preserves running ones'
-cleanup_prune() {
-  "$BANGER" vm delete smoke-prune-running >/dev/null 2>&1 || true
-  "$BANGER" vm delete smoke-prune-stopped >/dev/null 2>&1 || true
-}
-# shellcheck disable=SC2064
-trap "cleanup_prune; rm -rf '$runtime_dir'" EXIT
-
 "$BANGER" vm create --name smoke-prune-running >/dev/null || die 'vm prune: create running failed'
 "$BANGER" vm create --name smoke-prune-stopped >/dev/null || die 'vm prune: create stopped failed'
 "$BANGER" vm stop smoke-prune-stopped >/dev/null || die 'vm prune: stop the stopped one failed'
@ -364,20 +324,9 @@ if "$BANGER" vm show smoke-prune-stopped >/dev/null 2>&1; then
 fi

 "$BANGER" vm delete smoke-prune-running >/dev/null || die 'vm prune: cleanup delete failed'
-# shellcheck disable=SC2064
-trap "rm -rf '$runtime_dir'" EXIT

 # --- vm ports ---------------------------------------------------------
-# sshd binds :22 in every guest — it's the minimum promise of a VM.
-# If `vm ports` can't see that, the host→guest port visibility pipe
-# (vsock-agent on-demand query, daemon aggregation, CLI rendering) is
-# broken. Endpoint shape is also asserted: daemon prefers the
-# <name>.vm DNS record over the raw guest IP, so we grep for the
-# name form.
 log 'vm ports: sshd :22 visible from host, endpoint uses the VM DNS name'
-# shellcheck disable=SC2064
-trap "\"$BANGER\" vm delete smoke-ports >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT
-
 "$BANGER" vm create --name smoke-ports >/dev/null || die 'vm ports: create failed'
 wait_for_ssh smoke-ports || die 'vm ports: ssh did not come up'

@ -389,18 +338,9 @@ grep -q 'sshd' <<<"$ports_out" \
  || die "vm ports: expected process 'sshd' in output; got: $ports_out"

 "$BANGER" vm delete smoke-ports >/dev/null || die 'vm ports: delete failed'
-# shellcheck disable=SC2064
-trap "rm -rf '$runtime_dir'" EXIT

 # --- workspace prepare --mode full_copy -------------------------------
-# Default mode is shallow_overlay. full_copy copies the repo via a
-# different transfer path (tar stream into the guest's rootfs with
-# no overlay). Smoke asserts it still lands the content at the same
-# guest path.
 log 'workspace prepare --mode full_copy: alternate transfer path still delivers'
-# shellcheck disable=SC2064
-trap "\"$BANGER\" vm delete smoke-fc >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT
-
 "$BANGER" vm create --name smoke-fc >/dev/null || die 'workspace fc: create failed'
 "$BANGER" vm workspace prepare smoke-fc "$repodir" --mode full_copy >/dev/null \
  || die 'workspace fc: prepare --mode full_copy failed'
@ -410,42 +350,26 @@ grep -q 'smoke-workspace-marker' <<<"$fc_out" \
  || die "workspace fc: marker missing in full_copy workspace: $fc_out"

 "$BANGER" vm delete smoke-fc >/dev/null || die 'workspace fc: delete failed'
-# shellcheck disable=SC2064
-trap "rm -rf '$runtime_dir'" EXIT

 # --- workspace export --base-commit (committed guest delta) -----------
-# Without --base-commit, export diffs the worktree against HEAD — it
-# misses commits the worker made inside the guest (because the guest
-# HEAD advanced). With --base-commit pinned at the prepare-time SHA,
-# those commits land in the patch. This is the happy path the feature
-# was added for; zero coverage until now.
 log 'workspace export --base-commit: guest-side commits captured in patch'
-# shellcheck disable=SC2064
-trap "\"$BANGER\" vm delete smoke-basecommit >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT
-
 "$BANGER" vm create --name smoke-basecommit >/dev/null || die 'export base: create failed'
 "$BANGER" vm workspace prepare smoke-basecommit "$repodir" >/dev/null \
  || die 'export base: prepare failed'

-# Capture the prepare-time HEAD from the guest directly (same SHA the
-# daemon returns as HeadCommit in the RPC result).
 base_sha="$("$BANGER" vm ssh smoke-basecommit -- sh -c 'cd /root/repo && git rev-parse HEAD' | tr -d '[:space:]')"
 [[ "${#base_sha}" -eq 40 ]] || die "export base: bad base sha: $base_sha"

-# Make a guest-side commit: new file + git add + git commit. Without
-# --base-commit, this commit would be invisible to a HEAD-relative diff.
 "$BANGER" vm ssh smoke-basecommit -- sh -c "cd /root/repo && git -c user.email=smoke@smoke -c user.name=smoke checkout -b smoke-branch >/dev/null 2>&1 && echo committed-marker > smoke-committed.txt && git add smoke-committed.txt && git -c user.email=smoke@smoke -c user.name=smoke commit -q -m 'guest side'" \
  || die 'export base: guest-side commit failed'

-# Control: plain export (no --base-commit) must NOT see the committed file.
 plain_patch="$runtime_dir/smoke-plain.diff"
 "$BANGER" vm workspace export smoke-basecommit --output "$plain_patch" \
  || die 'export base: plain export failed'
-if grep -q 'smoke-committed.txt' "$plain_patch"; then
+if [[ -f "$plain_patch" ]] && grep -q 'smoke-committed.txt' "$plain_patch"; then
  die 'export base: plain export unexpectedly captured the guest-side commit'
 fi

-# With --base-commit pinned to the pre-commit SHA, the delta appears.
 base_patch="$runtime_dir/smoke-base.diff"
 "$BANGER" vm workspace export smoke-basecommit --base-commit "$base_sha" --output "$base_patch" \
  || die 'export base: --base-commit export failed'
@ -454,21 +378,11 @@ grep -q 'smoke-committed.txt' "$base_patch" \
  || die "export base: --base-commit patch missing committed marker (head: $(head -c 400 "$base_patch"))"

 "$BANGER" vm delete smoke-basecommit >/dev/null || die 'export base: delete failed'
-# shellcheck disable=SC2064
-trap "rm -rf '$runtime_dir'" EXIT

 # --- ssh-config install / uninstall (HOME-isolated) -------------------
-# `banger ssh-config --install` edits ~/.ssh/config. Smoke runs under
-# the invoking user, so we isolate by pointing HOME at the smoke XDG
-# dir before the commands run (os.UserHomeDir respects $HOME on
-# Linux). No daemon / VM involved — pure CLI + filesystem surface,
-# exercising the install/status/uninstall code paths end-to-end.
 log 'ssh-config --install / --uninstall: idempotent, survives round-trip'
-fake_home="$BANGER_SMOKE_XDG_DIR/fake-home"
+fake_home="$scratch_root/fake-home"
 mkdir -p "$fake_home/.ssh"
-# Seed a pre-existing ~/.ssh/config so install must APPEND, not
-# replace. A bug that clobbered pre-existing content would nuke the
-# user's real config on first run.
 printf 'Host myserver\n  HostName example.invalid\n' > "$fake_home/.ssh/config"

 (
@ -479,7 +393,6 @@ printf 'Host myserver\n  HostName example.invalid\n' > "$fake_home/.ssh/config"
  grep -q '^Host myserver' "$fake_home/.ssh/config" \
    || die 'ssh-config: install clobbered pre-existing content (!!)'

-  # Second install must be idempotent (no duplicate Include lines).
  "$BANGER" ssh-config --install >/dev/null || die 'ssh-config: second install failed'
  include_count="$(grep -c '^Include .*banger' "$fake_home/.ssh/config")"
  [[ "$include_count" == "1" ]] \
@ -494,22 +407,10 @@ printf 'Host myserver\n  HostName example.invalid\n' > "$fake_home/.ssh/config"
 )

 # --- NAT rule installation (per-VM MASQUERADE) ------------------------
-# `--nat` installs a per-VM iptables POSTROUTING MASQUERADE rule
-# scoped to the guest's /32 (see natCapability). End-to-end curl
-# tests don't work here because the bridge IP and the host's uplink
-# IP both belong to the host — a guest reaching the uplink address
-# lands on the host's local loopback whether MASQUERADE is set up
-# or not. So assert the rule itself: NAT VM gets a POSTROUTING
-# MASQUERADE, non-NAT VM does not. This catches the two most
-# plausible regressions (rule never installed; rule not scoped to
-# the right VM) without depending on an external reachable host.
 log 'NAT: --nat installs a per-VM MASQUERADE rule; no --nat means no rule'
 if ! sudo -n iptables -t nat -S POSTROUTING >/dev/null 2>&1; then
  log 'NAT: skipping — passwordless sudo iptables unavailable'
 else
-  # shellcheck disable=SC2064
-  trap "\"$BANGER\" vm delete smoke-nat >/dev/null 2>&1 || true; \"$BANGER\" vm delete smoke-nocnat >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT
-
  "$BANGER" vm create --name smoke-nat --nat >/dev/null || die 'NAT: create --nat failed'
  "$BANGER" vm create --name smoke-nocnat >/dev/null || die 'NAT: control create failed'

@ -524,9 +425,6 @@ else
    die "NAT: control VM unexpectedly has a MASQUERADE rule for $ctl_ip"
  fi

-  # Stop + start the --nat VM to exercise the install-is-idempotent
-  # path (capability runs again on each start; a buggy add-without-
-  # check would leave two identical rules behind).
  "$BANGER" vm stop smoke-nat >/dev/null || die 'NAT: stop --nat VM failed'
  "$BANGER" vm start smoke-nat >/dev/null || die 'NAT: restart --nat VM failed'
  postrouting="$(sudo -n iptables -t nat -S POSTROUTING 2>/dev/null || true)"
@ -534,7 +432,6 @@ else
  [[ "$rule_count" == "1" ]] \
    || die "NAT: MASQUERADE rule count for $nat_ip = $rule_count after restart, want 1"

-  # Delete must tear the rule down — regression guard against leaks.
  "$BANGER" vm delete smoke-nat >/dev/null || die 'NAT: delete --nat VM failed'
  "$BANGER" vm delete smoke-nocnat >/dev/null || die 'NAT: delete control VM failed'
  postrouting="$(sudo -n iptables -t nat -S POSTROUTING 2>/dev/null || true)"
@ -542,15 +439,8 @@ else
    die "NAT: delete left a MASQUERADE rule behind for $nat_ip"
  fi
 fi
-# shellcheck disable=SC2064
-trap "rm -rf '$runtime_dir'" EXIT

 # --- invalid spec rejection + no artifact leak ------------------------
-# Tests the negative-path create flow: a blatantly invalid VM spec
-# must fail before any VM row is persisted. The review cycle flagged
-# "cleanup on partial failure" as under-tested; this scenario pins
-# that a rejected create doesn't leak a reservation we then have to
-# clean up by hand.
 log 'invalid spec rejection: --vcpu 0 must fail and leave no VM behind'
 pre_vms="$("$BANGER" vm list --all 2>/dev/null | wc -l)"
 set +e
@ -562,13 +452,6 @@ post_vms="$("$BANGER" vm list --all 2>/dev/null | wc -l)"
 [[ "$pre_vms" == "$post_vms" ]] || die "invalid spec leaked a VM row: pre=$pre_vms, post=$post_vms"

 # --- invalid name rejection ------------------------------------------
-# VM names become DNS labels, guest hostnames, kernel-cmdline tokens
-# and file-path fragments — the validator (ValidateVMName) must reject
-# anything that isn't [a-z0-9-] with no leading/trailing hyphen and no
-# dots. Smoke covers a few of the worst offenders end-to-end through
-# the CLI; the full character-class matrix lives in
-# internal/model/vm_name_test.go. Rejected names must also leave no
-# VM row behind.
 log 'invalid name rejection: uppercase / space / dot / leading-hyphen must all fail'
 pre_vms="$("$BANGER" vm list --all 2>/dev/null | wc -l)"
 for bad in 'MyBox' 'my box' 'box.vm' '-box'; do
@ -582,11 +465,4 @@ post_vms="$("$BANGER" vm list --all 2>/dev/null | wc -l)"
 [[ "$pre_vms" == "$post_vms" ]] \
  || die "invalid name leaked VM row(s): pre=$pre_vms, post=$post_vms"

-# --- daemon stop (flushes coverage) -----------------------------------
-log 'stopping daemon so instrumented binaries flush coverage'
-"$BANGER" daemon stop >/dev/null 2>&1 || true
-# Give the daemon a moment to write its covdata pod before the trap
-# tears down runtime_dir.
-sleep 0.5
-
 log 'all scenarios passed'