make smoke: end-to-end boot suite with coverage from real VM runs

The unit + integration tests can't cross machine.Start — the SDK
boundary would need a fake firecracker that reimplements the
control-plane HTTP API, and the ongoing maintenance cost of keeping
that fake honest with upstream kills the value. Instead, add a
pre-release smoke target that drives REAL Firecracker + real KVM,
captures coverage from the -cover-instrumented binaries, and
surfaces per-package deltas so regressions in the boot path don't
ship silently.

scripts/smoke.sh:
  - Isolated XDG_{CONFIG,STATE,CACHE,RUNTIME} so the smoke run can't
    touch real user state (state/cache persist under build/smoke/xdg
    for fast reruns; runtime is mktemp'd fresh per-run because
    sockets can't be reused)
  - Preflight: `banger doctor` must pass; UDP :42069 must be free
    (otherwise the user's real daemon is up and the smoke daemon
    can't bind its DNS listener — fail with an actionable message)
  - Scenario 1 — bare: `banger vm run --rm -- echo smoke-bare-ok`
    exercises create → start → socket ownership chown → machine.Start
    → SDK waitForSocket race → vsock agent readiness → guest SSH
    wait → exec → cleanup → delete
  - Scenario 2 — workspace: creates a throwaway git repo, runs
    `banger vm run --rm <repo> -- cat /root/repo/smoke-file.txt`,
    verifies the tracked file reached the guest (exercises
    workDisk capability PrepareHost + workspace.prepare)
  - `banger daemon stop` at the end so instrumented binaries flush
    GOCOVERDIR pods before the script exits

Makefile additions:
  - smoke-build: builds banger/bangerd under build/smoke/bin/ with
    `go build -cover`
  - smoke: runs the script with GOCOVERDIR set, reports per-package
    coverage via `go tool covdata percent`
  - smoke-coverage-html: textfmt + go tool cover for a browsable
    report
  - smoke-clean: nukes build/smoke/ including the persisted XDG
    state

Bonus fix uncovered during the first smoke run: doctor treated a
missing state.db as a FAIL ("out of memory" from SQLite
SQLITE_CANTOPEN), which red-flagged every fresh install. Split
the store check: DB file absent → PASS with "will be created on
first daemon start" detail; DB present but unreadable → FAIL as
before. New TestDoctorReport_StoreMissingSurfacesAsPassForFreshInstall
pins the behaviour.

Concrete coverage delta from the first successful smoke run
(compared to `make coverage-total`'s unit-test-only 37.8%):

  internal/firecracker        43.6% → 75.0%
  internal/daemon/workspace   33.8% → 60.8%
  internal/store              40.1% → 56.3%
  internal/guest              63.7% → 57.4%  (different mix: smoke
                                              exercises real SSH;
                                              unit tests cover more
                                              error branches)

The packages the review flagged are the ones that moved most —
which is the point.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Thales Maciel 2026-04-22 18:59:57 -03:00
parent 52612b516b
commit 5f81332b0a
No known key found for this signature in database
GPG key ID: 33112E6833C34679
5 changed files with 221 additions and 18 deletions

View file

@ -26,35 +26,52 @@ func Doctor(ctx context.Context) (system.Report, error) {
}
// Doctor must be read-only: running it should never mutate the
// state DB (no migrations, no WAL checkpoint, no pragma writes).
// If the DB is missing or unreadable the storeErr path surfaces
// it as a failing check rather than half-opening a writable
// handle.
db, storeErr := store.OpenReadOnly(layout.DBPath)
// Skip OpenReadOnly entirely when the DB file doesn't exist —
// that's a fresh install, not an error condition. The first
// daemon start will create the file. storeMissing differentiates
// "no DB yet" (pass) from "DB present but unreadable" (fail) in
// the report.
d := &Daemon{
layout: layout,
config: cfg,
runner: system.NewRunner(),
}
if storeErr == nil {
defer db.Close()
d.store = db
var storeErr error
storeMissing := false
if _, statErr := os.Stat(layout.DBPath); statErr != nil {
if os.IsNotExist(statErr) {
storeMissing = true
} else {
storeErr = statErr
}
} else {
db, err := store.OpenReadOnly(layout.DBPath)
if err != nil {
storeErr = err
} else {
defer db.Close()
d.store = db
}
}
wireServices(d)
return d.doctorReport(ctx, storeErr), nil
return d.doctorReport(ctx, storeErr, storeMissing), nil
}
func (d *Daemon) doctorReport(ctx context.Context, storeErr error) system.Report {
func (d *Daemon) doctorReport(ctx context.Context, storeErr error, storeMissing bool) system.Report {
report := system.Report{}
addArchitectureCheck(&report)
if storeErr != nil {
switch {
case storeMissing:
report.AddPass("state store", "will be created on first daemon start at "+d.layout.DBPath)
case storeErr != nil:
report.AddFail(
"state store",
fmt.Sprintf("open %s: %v", d.layout.DBPath, storeErr),
"remove or restore the file if corrupt; otherwise check its permissions",
)
} else {
default:
report.AddPass("state store", "readable at "+d.layout.DBPath)
}

View file

@ -109,7 +109,7 @@ func findCheck(report system.Report, name string) *system.CheckResult {
func TestDoctorReport_StoreErrorSurfacesAsFail(t *testing.T) {
d := buildDoctorDaemon(t)
report := d.doctorReport(context.Background(), errors.New("simulated open failure"))
report := d.doctorReport(context.Background(), errors.New("simulated open failure"), false)
check := findCheck(report, "state store")
if check == nil {
@ -124,9 +124,31 @@ func TestDoctorReport_StoreErrorSurfacesAsFail(t *testing.T) {
}
}
func TestDoctorReport_StoreMissingSurfacesAsPassForFreshInstall(t *testing.T) {
d := buildDoctorDaemon(t)
// Fresh install: the DB file simply doesn't exist yet. doctor must
// not treat that as a failure — nothing's broken, the first daemon
// start will create the file. The status message should say so,
// so a user running `banger doctor` before ever booting a VM
// doesn't see a scary red check.
report := d.doctorReport(context.Background(), nil, true)
check := findCheck(report, "state store")
if check == nil {
t.Fatal("state store check missing from report")
}
if check.Status != system.CheckStatusPass {
t.Fatalf("state store status = %q, want pass for a missing DB on fresh install", check.Status)
}
joined := strings.Join(check.Details, " ")
if !strings.Contains(joined, "will be created") {
t.Fatalf("state store details = %q, want mention of 'will be created' so users know this is expected", joined)
}
}
func TestDoctorReport_StoreSuccessSurfacesAsPass(t *testing.T) {
d := buildDoctorDaemon(t)
report := d.doctorReport(context.Background(), nil)
report := d.doctorReport(context.Background(), nil, false)
check := findCheck(report, "state store")
if check == nil {
@ -141,7 +163,7 @@ func TestDoctorReport_MissingFirecrackerFailsHostRuntime(t *testing.T) {
d := buildDoctorDaemon(t)
d.config.FirecrackerBin = filepath.Join(t.TempDir(), "does-not-exist")
report := d.doctorReport(context.Background(), nil)
report := d.doctorReport(context.Background(), nil, false)
check := findCheck(report, "host runtime")
if check == nil {
t.Fatal("host runtime check missing from report")
@ -153,7 +175,7 @@ func TestDoctorReport_MissingFirecrackerFailsHostRuntime(t *testing.T) {
func TestDoctorReport_IncludesEveryDefaultCapability(t *testing.T) {
d := buildDoctorDaemon(t)
report := d.doctorReport(context.Background(), nil)
report := d.doctorReport(context.Background(), nil, false)
// Every registered capability that implements doctorCapability must
// contribute a check. Pre-v0.1 the defaults are work-disk, dns, nat.
@ -173,7 +195,7 @@ func TestDoctorReport_IncludesEveryDefaultCapability(t *testing.T) {
func TestDoctorReport_EmitsVMDefaultsProvenance(t *testing.T) {
d := buildDoctorDaemon(t)
report := d.doctorReport(context.Background(), nil)
report := d.doctorReport(context.Background(), nil, false)
check := findCheck(report, "vm defaults")
if check == nil {