From 5f81332b0aa17bb850aedd4f40495cfb4168383a Mon Sep 17 00:00:00 2001 From: Thales Maciel Date: Wed, 22 Apr 2026 18:59:57 -0300 Subject: [PATCH] make smoke: end-to-end boot suite with coverage from real VM runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The unit + integration tests can't cross machine.Start — the SDK boundary would need a fake firecracker that reimplements the control-plane HTTP API, and the ongoing maintenance cost of keeping that fake honest with upstream kills the value. Instead, add a pre-release smoke target that drives REAL Firecracker + real KVM, captures coverage from the -cover-instrumented binaries, and surfaces per-package deltas so regressions in the boot path don't ship silently. scripts/smoke.sh: - Isolated XDG_{CONFIG,STATE,CACHE,RUNTIME} so the smoke run can't touch real user state (state/cache persist under build/smoke/xdg for fast reruns; runtime is mktemp'd fresh per-run because sockets can't be reused) - Preflight: `banger doctor` must pass; UDP :42069 must be free (otherwise the user's real daemon is up and the smoke daemon can't bind its DNS listener — fail with an actionable message) - Scenario 1 — bare: `banger vm run --rm -- echo smoke-bare-ok` exercises create → start → socket ownership chown → machine.Start → SDK waitForSocket race → vsock agent readiness → guest SSH wait → exec → cleanup → delete - Scenario 2 — workspace: creates a throwaway git repo, runs `banger vm run --rm -- cat /root/repo/smoke-file.txt`, verifies the tracked file reached the guest (exercises workDisk capability PrepareHost + workspace.prepare) - `banger daemon stop` at the end so instrumented binaries flush GOCOVERDIR pods before the script exits Makefile additions: - smoke-build: builds banger/bangerd under build/smoke/bin/ with `go build -cover` - smoke: runs the script with GOCOVERDIR set, reports per-package coverage via `go tool covdata percent` - smoke-coverage-html: textfmt + go tool cover for a browsable report - smoke-clean: nukes build/smoke/ including the persisted XDG state Bonus fix uncovered during the first smoke run: doctor treated a missing state.db as a FAIL ("out of memory" from SQLite SQLITE_CANTOPEN), which red-flagged every fresh install. Split the store check: DB file absent → PASS with "will be created on first daemon start" detail; DB present but unreadable → FAIL as before. New TestDoctorReport_StoreMissingSurfacesAsPassForFreshInstall pins the behaviour. Concrete coverage delta from the first successful smoke run (compared to `make coverage-total`'s unit-test-only 37.8%): internal/firecracker 43.6% → 75.0% internal/daemon/workspace 33.8% → 60.8% internal/store 40.1% → 56.3% internal/guest 63.7% → 57.4% (different mix: smoke exercises real SSH; unit tests cover more error branches) The packages the review flagged are the ones that moved most — which is the point. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 1 + Makefile | 50 +++++++++++++- internal/daemon/doctor.go | 39 +++++++---- internal/daemon/doctor_test.go | 32 +++++++-- scripts/smoke.sh | 117 +++++++++++++++++++++++++++++++++ 5 files changed, 221 insertions(+), 18 deletions(-) create mode 100755 scripts/smoke.sh diff --git a/.gitignore b/.gitignore index cab6aed..cb1a133 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ state/ /build/bin/ +/build/smoke/ build/manual/ /runtime/ /dist/ diff --git a/Makefile b/Makefile index bf2954c..7a4a5d0 100644 --- a/Makefile +++ b/Makefile @@ -21,6 +21,11 @@ GO_SOURCES := $(shell find cmd internal -type f -name '*.go' | sort) # any redundant invocations. BUILD_INPUTS := $(shell find cmd internal -type f | sort) SHELL_SOURCES := $(shell find scripts -type f -name '*.sh' | sort) +SMOKE_DIR := $(BUILD_DIR)/smoke +SMOKE_BIN_DIR := $(SMOKE_DIR)/bin +SMOKE_COVER_DIR := $(SMOKE_DIR)/covdata +SMOKE_XDG_DIR := $(SMOKE_DIR)/xdg +SMOKE_SCRIPT := scripts/smoke.sh VERSION ?= $(shell git describe --tags --exact-match 2>/dev/null || echo dev) COMMIT ?= $(shell git rev-parse --verify HEAD 2>/dev/null || echo unknown) BUILT_AT ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ) @@ -28,7 +33,7 @@ GO_LDFLAGS := -X banger/internal/buildinfo.Version=$(VERSION) -X banger/internal .DEFAULT_GOAL := help -.PHONY: help build banger bangerd test fmt tidy clean install uninstall lint lint-go lint-shell coverage coverage-html coverage-total +.PHONY: help build banger bangerd test fmt tidy clean install uninstall lint lint-go lint-shell coverage coverage-html coverage-total smoke smoke-build smoke-coverage-html smoke-clean help: @printf '%s\n' \ @@ -43,7 +48,10 @@ help: ' make lint Run gofmt + go vet + shellcheck (errors)' \ ' make fmt Format Go sources under cmd/ and internal/' \ ' make tidy Run go mod tidy' \ - ' make clean Remove built Go binaries and coverage artefacts' + ' make clean Remove built Go binaries and coverage artefacts' \ + ' make smoke Build instrumented binaries, run scripts/smoke.sh, report coverage (needs KVM + sudo)' \ + ' make smoke-coverage-html HTML coverage report from the last smoke run' \ + ' make smoke-clean Remove the smoke build tree' build: $(BINARIES) @@ -101,6 +109,44 @@ tidy: clean: rm -rf "$(BUILD_BIN_DIR)" coverage.out coverage.html +# Smoke test suite. Builds the three banger binaries with -cover +# instrumentation under $(SMOKE_BIN_DIR), runs scripts/smoke.sh +# with GOCOVERDIR pointed at $(SMOKE_COVER_DIR), and prints the +# resulting coverage. The smoke script fully isolates state via +# XDG_* env vars pointing at a mktemp'd root, so the invoking +# user's real banger install stays untouched. +# +# Requires a KVM-capable Linux host with sudo; fails fast via +# `banger doctor` when either is missing. This is a pre-release +# gate, not CI — the Go test suite is what runs everywhere. +smoke-build: $(SMOKE_BIN_DIR)/.built + +$(SMOKE_BIN_DIR)/.built: $(BUILD_INPUTS) go.mod go.sum + mkdir -p "$(SMOKE_BIN_DIR)" + $(GO) build -cover -ldflags '$(GO_LDFLAGS)' -o "$(SMOKE_BIN_DIR)/banger" ./cmd/banger + $(GO) build -cover -ldflags '$(GO_LDFLAGS)' -o "$(SMOKE_BIN_DIR)/bangerd" ./cmd/bangerd + CGO_ENABLED=0 GOOS=linux GOARCH=amd64 $(GO) build -ldflags '$(GO_LDFLAGS)' -o "$(SMOKE_BIN_DIR)/banger-vsock-agent" ./cmd/banger-vsock-agent + touch "$@" + +smoke: smoke-build + rm -rf "$(SMOKE_COVER_DIR)" + mkdir -p "$(SMOKE_COVER_DIR)" "$(SMOKE_XDG_DIR)" + BANGER_SMOKE_BIN_DIR="$(abspath $(SMOKE_BIN_DIR))" \ + BANGER_SMOKE_COVER_DIR="$(abspath $(SMOKE_COVER_DIR))" \ + BANGER_SMOKE_XDG_DIR="$(abspath $(SMOKE_XDG_DIR))" \ + bash "$(SMOKE_SCRIPT)" + @echo '' + @echo 'Smoke coverage:' + @$(GO) tool covdata percent -i="$(SMOKE_COVER_DIR)" + +smoke-coverage-html: smoke + $(GO) tool covdata textfmt -i="$(SMOKE_COVER_DIR)" -o="$(SMOKE_DIR)/cover.out" + $(GO) tool cover -html="$(SMOKE_DIR)/cover.out" -o "$(SMOKE_DIR)/cover.html" + @echo 'wrote $(SMOKE_DIR)/cover.html' + +smoke-clean: + rm -rf "$(SMOKE_DIR)" + install: build mkdir -p "$(DESTDIR)$(BINDIR)" mkdir -p "$(DESTDIR)$(LIBDIR)/banger" diff --git a/internal/daemon/doctor.go b/internal/daemon/doctor.go index 04bb49f..bb0e57d 100644 --- a/internal/daemon/doctor.go +++ b/internal/daemon/doctor.go @@ -26,35 +26,52 @@ func Doctor(ctx context.Context) (system.Report, error) { } // Doctor must be read-only: running it should never mutate the // state DB (no migrations, no WAL checkpoint, no pragma writes). - // If the DB is missing or unreadable the storeErr path surfaces - // it as a failing check rather than half-opening a writable - // handle. - db, storeErr := store.OpenReadOnly(layout.DBPath) + // Skip OpenReadOnly entirely when the DB file doesn't exist — + // that's a fresh install, not an error condition. The first + // daemon start will create the file. storeMissing differentiates + // "no DB yet" (pass) from "DB present but unreadable" (fail) in + // the report. d := &Daemon{ layout: layout, config: cfg, runner: system.NewRunner(), } - if storeErr == nil { - defer db.Close() - d.store = db + var storeErr error + storeMissing := false + if _, statErr := os.Stat(layout.DBPath); statErr != nil { + if os.IsNotExist(statErr) { + storeMissing = true + } else { + storeErr = statErr + } + } else { + db, err := store.OpenReadOnly(layout.DBPath) + if err != nil { + storeErr = err + } else { + defer db.Close() + d.store = db + } } wireServices(d) - return d.doctorReport(ctx, storeErr), nil + return d.doctorReport(ctx, storeErr, storeMissing), nil } -func (d *Daemon) doctorReport(ctx context.Context, storeErr error) system.Report { +func (d *Daemon) doctorReport(ctx context.Context, storeErr error, storeMissing bool) system.Report { report := system.Report{} addArchitectureCheck(&report) - if storeErr != nil { + switch { + case storeMissing: + report.AddPass("state store", "will be created on first daemon start at "+d.layout.DBPath) + case storeErr != nil: report.AddFail( "state store", fmt.Sprintf("open %s: %v", d.layout.DBPath, storeErr), "remove or restore the file if corrupt; otherwise check its permissions", ) - } else { + default: report.AddPass("state store", "readable at "+d.layout.DBPath) } diff --git a/internal/daemon/doctor_test.go b/internal/daemon/doctor_test.go index 7544693..dbf0639 100644 --- a/internal/daemon/doctor_test.go +++ b/internal/daemon/doctor_test.go @@ -109,7 +109,7 @@ func findCheck(report system.Report, name string) *system.CheckResult { func TestDoctorReport_StoreErrorSurfacesAsFail(t *testing.T) { d := buildDoctorDaemon(t) - report := d.doctorReport(context.Background(), errors.New("simulated open failure")) + report := d.doctorReport(context.Background(), errors.New("simulated open failure"), false) check := findCheck(report, "state store") if check == nil { @@ -124,9 +124,31 @@ func TestDoctorReport_StoreErrorSurfacesAsFail(t *testing.T) { } } +func TestDoctorReport_StoreMissingSurfacesAsPassForFreshInstall(t *testing.T) { + d := buildDoctorDaemon(t) + // Fresh install: the DB file simply doesn't exist yet. doctor must + // not treat that as a failure — nothing's broken, the first daemon + // start will create the file. The status message should say so, + // so a user running `banger doctor` before ever booting a VM + // doesn't see a scary red check. + report := d.doctorReport(context.Background(), nil, true) + + check := findCheck(report, "state store") + if check == nil { + t.Fatal("state store check missing from report") + } + if check.Status != system.CheckStatusPass { + t.Fatalf("state store status = %q, want pass for a missing DB on fresh install", check.Status) + } + joined := strings.Join(check.Details, " ") + if !strings.Contains(joined, "will be created") { + t.Fatalf("state store details = %q, want mention of 'will be created' so users know this is expected", joined) + } +} + func TestDoctorReport_StoreSuccessSurfacesAsPass(t *testing.T) { d := buildDoctorDaemon(t) - report := d.doctorReport(context.Background(), nil) + report := d.doctorReport(context.Background(), nil, false) check := findCheck(report, "state store") if check == nil { @@ -141,7 +163,7 @@ func TestDoctorReport_MissingFirecrackerFailsHostRuntime(t *testing.T) { d := buildDoctorDaemon(t) d.config.FirecrackerBin = filepath.Join(t.TempDir(), "does-not-exist") - report := d.doctorReport(context.Background(), nil) + report := d.doctorReport(context.Background(), nil, false) check := findCheck(report, "host runtime") if check == nil { t.Fatal("host runtime check missing from report") @@ -153,7 +175,7 @@ func TestDoctorReport_MissingFirecrackerFailsHostRuntime(t *testing.T) { func TestDoctorReport_IncludesEveryDefaultCapability(t *testing.T) { d := buildDoctorDaemon(t) - report := d.doctorReport(context.Background(), nil) + report := d.doctorReport(context.Background(), nil, false) // Every registered capability that implements doctorCapability must // contribute a check. Pre-v0.1 the defaults are work-disk, dns, nat. @@ -173,7 +195,7 @@ func TestDoctorReport_IncludesEveryDefaultCapability(t *testing.T) { func TestDoctorReport_EmitsVMDefaultsProvenance(t *testing.T) { d := buildDoctorDaemon(t) - report := d.doctorReport(context.Background(), nil) + report := d.doctorReport(context.Background(), nil, false) check := findCheck(report, "vm defaults") if check == nil { diff --git a/scripts/smoke.sh b/scripts/smoke.sh new file mode 100755 index 0000000..d0984f6 --- /dev/null +++ b/scripts/smoke.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +# +# scripts/smoke.sh — end-to-end smoke suite for banger. +# +# Drives a real create → start → ssh → exec → delete cycle against +# real Firecracker + real KVM on the host. Intended as a pre-release +# gate: the Go unit + integration tests don't and can't cover the +# post-machine.Start path (socket ownership, guest boot, vsock agent +# wait, guest SSH, workspace prepare). If this suite fails, don't +# ship. +# +# State lives under $BANGER_SMOKE_XDG_DIR (set by `make smoke`, +# defaults to build/smoke/xdg). It's ISOLATED from the invoking +# user's real banger install via XDG_{CONFIG,STATE,CACHE,RUNTIME} +# overrides, but PERSISTED across runs — so the first smoke pulls +# the golden image, subsequent smokes reuse it. `make smoke-clean` +# wipes it. +# +# Invoked via `make smoke`, which sets the three env vars below. +# Don't run this directly unless you know they're set. + +set -euo pipefail + +log() { printf '[smoke] %s\n' "$*" >&2; } +die() { printf '[smoke] FAIL: %s\n' "$*" >&2; exit 1; } + +: "${BANGER_SMOKE_BIN_DIR:?must point at the instrumented binary dir, set by make smoke}" +: "${BANGER_SMOKE_COVER_DIR:?must point at the coverage dir, set by make smoke}" +: "${BANGER_SMOKE_XDG_DIR:?must point at the isolated XDG root, set by make smoke}" + +BANGER="$BANGER_SMOKE_BIN_DIR/banger" +BANGERD="$BANGER_SMOKE_BIN_DIR/bangerd" +VSOCK_AGENT="$BANGER_SMOKE_BIN_DIR/banger-vsock-agent" + +for bin in "$BANGER" "$BANGERD" "$VSOCK_AGENT"; do + [[ -x "$bin" ]] || die "binary missing or not executable: $bin" +done + +# Persistent XDG dirs (state, cache, config) so repeated smoke +# runs reuse the pulled golden image instead of re-downloading +# ~300MB each time. Runtime dir needs to be fresh per-run because +# it holds sockets the daemon cleans up on stop and refuses to +# reuse if any are stale. +mkdir -p \ + "$BANGER_SMOKE_XDG_DIR/config" \ + "$BANGER_SMOKE_XDG_DIR/state" \ + "$BANGER_SMOKE_XDG_DIR/cache" +runtime_dir="$(mktemp -d -t banger-smoke-runtime-XXXXXX)" +# shellcheck disable=SC2064 +trap "rm -rf '$runtime_dir'" EXIT +chmod 0700 "$runtime_dir" + +export XDG_CONFIG_HOME="$BANGER_SMOKE_XDG_DIR/config" +export XDG_STATE_HOME="$BANGER_SMOKE_XDG_DIR/state" +export XDG_CACHE_HOME="$BANGER_SMOKE_XDG_DIR/cache" +export XDG_RUNTIME_DIR="$runtime_dir" + +# Point banger at its companion binaries inside the smoke build. +export BANGER_DAEMON_BIN="$BANGERD" +export BANGER_VSOCK_AGENT_BIN="$VSOCK_AGENT" + +# Instrumented binaries dump coverage here on clean exit. +export GOCOVERDIR="$BANGER_SMOKE_COVER_DIR" +mkdir -p "$GOCOVERDIR" + +# Any smoke daemon left behind from a prior run that crashed mid- +# scenario would reuse the stale socket path and confuse +# ensureDaemon. Best-effort stop; ignore if nothing is running. +"$BANGER" daemon stop >/dev/null 2>&1 || true + +# banger's vmDNS binds 127.0.0.1:42069 (UDP) hard. If the user's +# real (non-smoke) daemon is running, its listener holds the port +# and the smoke daemon's Open() fails before any scenario runs. +# Fail fast with an actionable message — don't guess whether to +# stop the user's daemon for them. +if command -v ss >/dev/null 2>&1 && ss -Huln 2>/dev/null | awk '{print $4}' | grep -q '[:.]42069$'; then + die 'port 127.0.0.1:42069 is already bound (likely your real banger daemon); stop it with `banger daemon stop` and re-run `make smoke`' +fi + +# --- doctor ----------------------------------------------------------- +log 'doctor: checking host readiness' +if ! "$BANGER" doctor; then + die 'doctor reported failures; fix the host before running smoke' +fi + +# --- bare vm run ------------------------------------------------------ +log "bare vm run: create + start + ssh + exec 'echo smoke-bare-ok' + --rm" +bare_out="$("$BANGER" vm run --rm -- echo smoke-bare-ok)" || die "bare vm run exit $?" +grep -q 'smoke-bare-ok' <<<"$bare_out" || die "bare vm run stdout missing marker: $bare_out" + +# --- workspace vm run ------------------------------------------------- +log 'workspace vm run: preparing a throwaway git repo' +repodir="$runtime_dir/fake-repo" +mkdir -p "$repodir" +( + cd "$repodir" + git init -q -b main + git config commit.gpgsign false + git config user.name smoke + git config user.email smoke@smoke + echo 'smoke-workspace-marker' > smoke-file.txt + git add . + git commit -q -m init +) + +log "workspace vm run: create + start + workspace prepare + cat guest file + --rm" +ws_out="$("$BANGER" vm run --rm "$repodir" -- cat /root/repo/smoke-file.txt)" || die "workspace vm run exit $?" +grep -q 'smoke-workspace-marker' <<<"$ws_out" || die "workspace vm run didn't ship smoke-file.txt: $ws_out" + +# --- daemon stop (flushes coverage) ----------------------------------- +log 'stopping daemon so instrumented binaries flush coverage' +"$BANGER" daemon stop >/dev/null 2>&1 || true +# Give the daemon a moment to write its covdata pod before the trap +# tears down runtime_dir. +sleep 0.5 + +log 'all scenarios passed'