banger/internal/daemon/doctor.go
Thales Maciel 1c1ca7d6a4
doctor: pin firecracker version range, distro-aware install hint
Pre-release polish: be explicit about which firecracker versions
banger has been validated against, and give users a one-line install
suggestion when the binary is missing rather than the previous
generic "install firecracker or set firecracker_bin".

internal/firecracker/version.go (new):
  * MinSupportedVersion = "1.5.0" — the floor banger refuses to
    launch below. Bumping this is a deliberate decision, paired
    with whatever helper feature started requiring the newer
    firecracker.
  * KnownTestedVersion = "1.14.1" — what banger's smoke suite
    actually runs against today.
  * SemVer + Compare + ParseVersionOutput, table-tested. The parser
    tolerates the trailing "exiting successfully" log line that
    firecracker tacks onto --version; only the canonical
    "Firecracker vX.Y.Z" line matters.
  * QueryVersion shells `<bin> --version` through a CommandRunner-
    shaped interface; doesn't import internal/system to keep the
    firecracker package leaf-clean.

internal/daemon/doctor.go:
  * New addFirecrackerVersionCheck replaces the previous bare
    RequireExecutable preflight for firecracker. Three outcomes:
    PASS within [Min, Tested], WARN above Tested (newer firecracker
    usually works but is outside the tested window), FAIL below Min
    or when the binary is missing.
  * On missing binary, surfaces a distro-aware install command via
    parseOSReleaseIDs(/etc/os-release) → guessFirecrackerInstallCommand.
    Pinned suggestions for debian (apt), arch/manjaro (paru), and
    nixos (nix-env). Other distros get only the upstream Releases
    URL — guessing wrong sends users on a wild goose chase.
  * runtimeChecks no longer includes the firecracker preflight; the
    new check subsumes it.

README.md:
  * Requirements line now spells out the tested-against version
    (v1.14.1) and the supported floor (≥ v1.5.0), and points at
    `banger doctor` for the version check + install hint.

Tests: ParseVersionOutput across canonical/prerelease/garbage inputs,
SemVer.Compare across major/minor/patch boundaries, MustParseSemVer
panics on malformed inputs. Doctor-side: PASS on tested version,
FAIL below Min, WARN above Tested, FAIL with upstream URL when
missing, install-hint dispatch table covering debian/ubuntu (via
ID_LIKE)/arch/manjaro/nixos/fedora-fallback/missing-os-release.
The renamed TestDoctorReport_MissingFirecrackerFails... now asserts
against the new check name. Live `banger doctor` reports
"v1.14.1 at /usr/bin/firecracker (within tested range; min v1.5.0,
tested v1.14.1)" against the smoke host.

Smoke bare_run still green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 17:47:42 -03:00

542 lines
20 KiB
Go

package daemon
import (
"context"
"fmt"
"os"
"path/filepath"
"runtime"
"strings"
"syscall"
"banger/internal/config"
"banger/internal/firecracker"
"banger/internal/imagecat"
"banger/internal/installmeta"
"banger/internal/model"
"banger/internal/paths"
"banger/internal/store"
"banger/internal/system"
)
// systemdSystemDir is the directory systemd reads enabled units from.
// It is a var (not a const) so the security-posture tests can swap it
// for a tempdir without faking /etc/systemd/system on the test host.
// addSecurityPostureChecks passes it to addSecurityPostureChecksAt as
// the directory the unit files are looked up in.
var systemdSystemDir = "/etc/systemd/system"
// Doctor produces the host diagnostic report. It resolves the user
// layout, loads the config, and builds a Daemon value whose
// doctorReport method runs the individual checks.
//
// An error is returned only when the layout or the config cannot be
// resolved; trouble with the state store is handed to doctorReport
// and shows up as a row in the report instead.
func Doctor(ctx context.Context) (system.Report, error) {
	userLayout, err := paths.Resolve()
	if err != nil {
		return system.Report{}, err
	}
	cfg, err := config.Load(userLayout)
	if err != nil {
		return system.Report{}, err
	}
	layout := paths.ResolveSystem()

	d := &Daemon{
		layout:     layout,
		userLayout: userLayout,
		config:     cfg,
		runner:     system.NewRunner(),
	}

	// Doctor must stay read-only: running it should never mutate the
	// state DB (no migrations, no WAL checkpoint, no pragma writes).
	// A DB file that doesn't exist yet is a fresh install rather than
	// an error, so OpenReadOnly is skipped in that case — the first
	// daemon start creates the file. storeMissing lets the report
	// tell "no DB yet" (pass) apart from "DB present but unreadable"
	// (fail).
	var (
		storeErr     error
		storeMissing bool
	)
	_, statErr := os.Stat(layout.DBPath)
	switch {
	case statErr == nil:
		db, openErr := store.OpenReadOnly(layout.DBPath)
		if openErr != nil {
			storeErr = openErr
			break
		}
		// Stays open until Doctor returns, i.e. for the whole report run.
		defer db.Close()
		d.store = db
	case os.IsNotExist(statErr):
		storeMissing = true
	default:
		storeErr = statErr
	}

	wireServices(d)
	return d.doctorReport(ctx, storeErr, storeMissing), nil
}
// doctorReport runs every doctor check against d and returns the
// collected report. storeMissing and storeErr describe what Doctor
// found when probing the state DB: a missing file is reported as a
// pass, an error as a fail, and anything else as a readable store.
// The rows are appended in a fixed order, starting with the host
// architecture and ending with the security posture.
func (d *Daemon) doctorReport(ctx context.Context, storeErr error, storeMissing bool) system.Report {
	var report system.Report
	addArchitectureCheck(&report)

	const storeCheck = "state store"
	if storeMissing {
		report.AddPass(storeCheck, "will be created on first daemon start at "+d.layout.DBPath)
	} else if storeErr != nil {
		report.AddFail(
			storeCheck,
			fmt.Sprintf("open %s: %v", d.layout.DBPath, storeErr),
			"remove or restore the file if corrupt; otherwise check its permissions",
		)
	} else {
		report.AddPass(storeCheck, "readable at "+d.layout.DBPath)
	}

	// Grouped preflights: each one contributes a single report row.
	report.AddPreflight("host runtime", d.runtimeChecks(), runtimeStatus(d.config))
	report.AddPreflight("core vm lifecycle", d.coreVMLifecycleChecks(), "required host tools available")
	report.AddPreflight("vsock guest agent", d.vsockChecks(), "vsock guest agent prerequisites available")

	// Standalone checks that add their own rows.
	d.addVMDefaultsCheck(&report)
	d.addSSHShortcutCheck(&report)
	d.addCapabilityDoctorChecks(ctx, &report)
	d.addFirecrackerVersionCheck(ctx, &report)
	d.addSecurityPostureChecks(ctx, &report)

	return report
}
// addFirecrackerVersionCheck reports on the configured firecracker
// binary under the "firecracker binary" check name. An empty
// firecracker_bin falls back to looking up "firecracker". Possible
// outcomes:
//
//   - binary cannot be located: FAIL, with the install hint derived
//     from osReleaseSource plus the upstream Releases URL.
//   - `--version` cannot be queried or parsed: FAIL with a reinstall
//     hint.
//   - version below firecracker.MinSupportedVersion: FAIL with an
//     upgrade hint.
//   - version above firecracker.KnownTestedVersion: WARN. Newer
//     firecracker usually works, but it is outside banger's tested
//     window.
//   - anything in between: PASS.
//
// The generic RequireExecutable preflight is deliberately not used
// here — its static hint string can't carry the distro dispatch.
func (d *Daemon) addFirecrackerVersionCheck(ctx context.Context, report *system.Report) {
	const checkName = "firecracker binary"

	binPath := "firecracker"
	if configured := strings.TrimSpace(d.config.FirecrackerBin); configured != "" {
		binPath = configured
	}

	resolved, lookupErr := system.LookupExecutable(binPath)
	if lookupErr != nil {
		details := append(
			[]string{fmt.Sprintf("not found: %s", binPath)},
			firecrackerInstallHint(osReleaseSource)...,
		)
		report.AddFail(checkName, details...)
		return
	}

	version, queryErr := firecracker.QueryVersion(ctx, d.runner, resolved)
	if queryErr != nil {
		report.AddFail(checkName,
			fmt.Sprintf("`%s --version` failed: %v", resolved, queryErr),
			"reinstall firecracker; see https://github.com/firecracker-microvm/firecracker/releases")
		return
	}

	// Named floor/ceiling rather than min/max so the Go builtins are
	// not shadowed inside this function.
	var (
		reported = version.String()
		floor    = firecracker.MustParseSemVer(firecracker.MinSupportedVersion)
		ceiling  = firecracker.MustParseSemVer(firecracker.KnownTestedVersion)
	)

	if version.Compare(floor) < 0 {
		report.AddFail(checkName,
			fmt.Sprintf("%s at %s; banger requires ≥ v%s", reported, resolved, firecracker.MinSupportedVersion),
			"upgrade firecracker — see https://github.com/firecracker-microvm/firecracker/releases")
		return
	}
	if version.Compare(ceiling) > 0 {
		report.AddWarn(checkName,
			fmt.Sprintf("%s at %s (newer than banger's tested v%s; usually works)", reported, resolved, firecracker.KnownTestedVersion))
		return
	}
	report.AddPass(checkName,
		fmt.Sprintf("%s at %s (within tested range; min v%s, tested v%s)",
			reported, resolved, firecracker.MinSupportedVersion, firecracker.KnownTestedVersion))
}
// osReleaseSource is the file the install hint reads to detect the
// host distro. It is a var rather than a const so doctor tests can
// swap in a fixture. addFirecrackerVersionCheck passes it to
// firecrackerInstallHint when the firecracker binary cannot be found.
var osReleaseSource = "/etc/os-release"
// firecrackerInstallHint builds the detail lines that tell the user
// how to get firecracker onto this host. The upstream Releases URL is
// always the last line; when the distro named in osReleasePath is one
// we recognise, an "install: <command>" line comes before it. For
// unrecognised distros only the URL is returned — better silence than
// wrong instructions.
func firecrackerInstallHint(osReleasePath string) []string {
	const upstream = "or download a static binary from https://github.com/firecracker-microvm/firecracker/releases"

	cmd := guessFirecrackerInstallCommand(osReleasePath)
	if cmd == "" {
		return []string{upstream}
	}
	return []string{"install: " + cmd, upstream}
}
// guessFirecrackerInstallCommand maps the distro described by the
// os-release file at osReleasePath to a short, copy-pasteable install
// command. The ID value is tried first, then each whitespace-separated
// ID_LIKE entry in order; the first recognised name wins. It returns
// "" when the file cannot be read or no entry is recognised. Only
// distros where firecracker is actually packaged are listed —
// guessing wrong here would send users on a wild goose chase.
func guessFirecrackerInstallCommand(osReleasePath string) string {
	raw, err := os.ReadFile(osReleasePath)
	if err != nil {
		return ""
	}

	// commandFor returns the install command for one distro name, or
	// "" when that name has no pinned suggestion.
	commandFor := func(distro string) string {
		switch distro {
		case "debian":
			// Packaged in Debian since trixie / bookworm-backports.
			return "sudo apt install firecracker"
		case "arch", "manjaro", "endeavouros":
			// AUR; no specific helper is assumed, but `paru` is the
			// common one. Users who prefer yay/makepkg/etc. will
			// substitute mentally.
			return "paru -S firecracker # or your preferred AUR helper"
		case "nixos":
			return "nix-env -iA nixos.firecracker # or add to your configuration.nix"
		}
		return ""
	}

	id, idLike := parseOSReleaseIDs(string(raw))
	if cmd := commandFor(id); cmd != "" {
		return cmd
	}
	for _, like := range strings.Fields(idLike) {
		if cmd := commandFor(like); cmd != "" {
			return cmd
		}
	}
	return ""
}
// parseOSReleaseIDs extracts the ID and ID_LIKE values from an
// /etc/os-release blob. Surrounding quotes are stripped from both
// values; os-release permits either double or single quotes, so
// `ID="debian"` and `ID='debian'` both yield "debian". Missing keys
// return empty strings, and if a key appears more than once the last
// occurrence wins.
//
// The format is not validated beyond `KEY=value` — os-release is a
// simple format and any drift would manifest as a quiet "no distro
// hint" rather than a false positive.
func parseOSReleaseIDs(content string) (id, idLike string) {
	// Both quote characters a shell-compatible os-release may use.
	const quotes = `"'`

	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			// Blank line or comment: nothing to extract.
			continue
		}
		key, value, ok := strings.Cut(line, "=")
		if !ok {
			continue
		}
		// The key must match exactly, so VERSION_ID / VARIANT_ID and
		// friends are never mistaken for ID.
		switch key {
		case "ID":
			id = strings.Trim(value, quotes)
		case "ID_LIKE":
			idLike = strings.Trim(value, quotes)
		}
	}
	return id, idLike
}
// addSecurityPostureChecks checks that the install matches what
// docs/privileges.md describes: helper and owner-daemon units active,
// sockets at the expected mode/owner, unit files carrying the
// hardening directives, and the firecracker binary root-owned and
// non-writable. Drift between the doc and the running install would
// silently weaken the trust model; surfacing it here makes the doc
// load-bearing rather than aspirational.
//
// When the install metadata cannot be loaded (non-system mode) a
// single warn is emitted instead, pointing at the docs section on the
// looser dev-mode trust model — a PASS row in that mode would imply
// guarantees the install isn't actually providing.
//
// This is the production entry point: it runs the checks against
// installmeta.DefaultPath and systemdSystemDir.
func (d *Daemon) addSecurityPostureChecks(ctx context.Context, report *system.Report) {
	d.addSecurityPostureChecksAt(
		ctx,
		report,
		installmeta.DefaultPath,
		systemdSystemDir,
	)
}
// addSecurityPostureChecksAt does the work behind
// addSecurityPostureChecks with the install metadata path and systemd
// unit directory passed in, so tests can exercise the system-mode
// branch with a fake install.toml and unit dir instead of writing to
// /etc.
//
// Any error loading installPath is treated as "not a system install":
// one warn row is added and no further checks run. Otherwise the
// service, socket, unit-hardening, and firecracker ownership checks
// are appended in that order.
func (d *Daemon) addSecurityPostureChecksAt(ctx context.Context, report *system.Report, installPath, systemdDir string) {
	meta, loadErr := installmeta.Load(installPath)
	if loadErr != nil {
		report.AddWarn("security posture",
			"running outside the system install (no "+installPath+")",
			"helper SO_PEERCRED, narrow CapabilityBoundingSet, NoNewPrivileges, and ProtectSystem=strict are bypassed in this mode",
			"see docs/privileges.md > 'Running outside the system install'; install via `sudo banger system install --owner $USER` for the supported trust model")
		return
	}

	// Directives each unit file is required to carry.
	helperDirectives := []string{
		"NoNewPrivileges=yes",
		"ProtectSystem=strict",
		"ProtectHome=yes",
		"RestrictSUIDSGID=yes",
		"LockPersonality=yes",
		"CapabilityBoundingSet=",
	}
	daemonDirectives := []string{
		"User=" + meta.OwnerUser,
		"NoNewPrivileges=yes",
		"ProtectSystem=strict",
		"ProtectHome=read-only",
		"RestrictSUIDSGID=yes",
		"LockPersonality=yes",
	}
	helperUnit := filepath.Join(systemdDir, installmeta.DefaultRootHelperService)
	daemonUnit := filepath.Join(systemdDir, installmeta.DefaultService)

	addServiceActiveCheck(ctx, d.runner, report, "helper service", installmeta.DefaultRootHelperService)
	addServiceActiveCheck(ctx, d.runner, report, "owner daemon service", installmeta.DefaultService)

	// Both sockets are expected to be 0600 and owned by the registered owner UID.
	addSocketPermsCheck(report, "helper socket", installmeta.DefaultRootHelperSocketPath, meta.OwnerUID, 0o600)
	addSocketPermsCheck(report, "daemon socket", installmeta.DefaultSocketPath, meta.OwnerUID, 0o600)

	addUnitHardeningCheck(report, "helper unit hardening", helperUnit, helperDirectives)
	addUnitHardeningCheck(report, "daemon unit hardening", daemonUnit, daemonDirectives)

	addExecutableOwnershipCheck(report, "firecracker binary ownership", d.config.FirecrackerBin)
}
// addServiceActiveCheck runs `systemctl is-active <service>` through
// runner and records the outcome under name. is-active exits non-zero
// for inactive/failed states but still prints the state on stdout, so
// only the trimmed output is inspected and the returned error is
// deliberately ignored. "active" is a pass; any other state — or no
// output at all, reported as "unknown" — is a fail carrying a
// systemctl-restart hint.
func addServiceActiveCheck(ctx context.Context, runner system.CommandRunner, report *system.Report, name, service string) {
	// Error intentionally dropped: the state on stdout is the signal.
	out, _ := runner.Run(ctx, "systemctl", "is-active", service)

	state := strings.TrimSpace(string(out))
	switch state {
	case "active":
		report.AddPass(name, fmt.Sprintf("%s is active", service))
		return
	case "":
		state = "unknown"
	}

	report.AddFail(name,
		fmt.Sprintf("%s is %s, not active", service, state),
		fmt.Sprintf("run `sudo systemctl restart %s` and re-run `banger doctor`", service))
}
// addSocketPermsCheck stats path and compares its permission bits and
// owning UID with expectedMode and expectedUID, recording the result
// under name. Both daemon and helper sockets are meant to be 0600 and
// chowned to the registered owner UID; any drift means filesystem
// perms aren't gating access the way the docs describe.
//
// A failed stat is a fail. If the platform's FileInfo does not expose
// a *syscall.Stat_t the ownership cannot be read, which is a warn.
func addSocketPermsCheck(report *system.Report, name, path string, expectedUID int, expectedMode os.FileMode) {
	info, statErr := os.Stat(path)
	if statErr != nil {
		report.AddFail(name,
			fmt.Sprintf("%s: %v", path, statErr),
			"is the service running? `sudo systemctl status` and check the runtime dir")
		return
	}

	sys, haveOwner := info.Sys().(*syscall.Stat_t)
	if !haveOwner {
		report.AddWarn(name, fmt.Sprintf("%s: cannot read ownership metadata on this platform", path))
		return
	}

	perm := info.Mode().Perm()
	var problems []string
	if perm != expectedMode {
		problems = append(problems, fmt.Sprintf("mode is %#o, want %#o", perm, expectedMode))
	}
	if int(sys.Uid) != expectedUID {
		problems = append(problems, fmt.Sprintf("uid is %d, want %d", sys.Uid, expectedUID))
	}

	if len(problems) == 0 {
		report.AddPass(name, fmt.Sprintf("%s: mode %#o, uid %d", path, perm, expectedUID))
		return
	}

	// The remediation is appended to the problem list, so it ends up
	// in the same "; "-joined detail line as the findings.
	problems = append(problems, "restart the service so the socket gets recreated with correct perms")
	report.AddFail(name, fmt.Sprintf("%s: %s", path, strings.Join(problems, "; ")))
}
// addUnitHardeningCheck reads the systemd unit file at path and
// confirms that every entry in required appears at the start of some
// active line, recording the result under name. Blank lines and
// comment lines (leading '#' or ';') are skipped, so a directive that
// was commented out counts as missing instead of slipping through.
//
// Matching is by prefix on the whitespace-trimmed line, so an entry
// such as "CapabilityBoundingSet=" accepts any value for that key. It
// is still a textual check — it does not resolve drop-ins or later
// overriding assignments — but it is strong enough to catch the
// "someone hand-edited the unit and dropped NoNewPrivileges" failure
// mode that motivates it. The directives list captures the
// security-relevant subset of the renderer in commands_system.go;
// everything else (Description, ExecStart, etc.) is operational and
// not worth pinning here.
//
// An unreadable unit file or any missing directive is a fail with a
// reinstall hint; otherwise the check passes.
func addUnitHardeningCheck(report *system.Report, name, path string, required []string) {
	data, err := os.ReadFile(path)
	if err != nil {
		report.AddFail(name,
			fmt.Sprintf("%s: %v", path, err),
			"reinstall via `sudo banger system install` to refresh the unit")
		return
	}

	// Keep only the lines systemd would act on.
	var active []string
	for _, line := range strings.Split(string(data), "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") || strings.HasPrefix(line, ";") {
			continue
		}
		active = append(active, line)
	}

	var missing []string
	for _, directive := range required {
		found := false
		for _, line := range active {
			if strings.HasPrefix(line, directive) {
				found = true
				break
			}
		}
		if !found {
			missing = append(missing, directive)
		}
	}

	if len(missing) > 0 {
		report.AddFail(name,
			fmt.Sprintf("%s missing directives: %s", path, strings.Join(missing, ", ")),
			"reinstall via `sudo banger system install` to refresh the unit")
		return
	}
	report.AddPass(name, fmt.Sprintf("%s: %d hardening directives present", path, len(required)))
}
// addExecutableOwnershipCheck mirrors validateRootExecutable's
// runtime check at doctor time and records the result under name. The
// file at path must be, in the order tested: not a symlink, a regular
// file, executable, not group/world writable, and owned by uid 0.
// The first violation is reported as a fail and ends the check.
// Catching this once at install time beats the helper failing every
// launch with a less helpful message.
//
// A blank path, or a platform whose FileInfo does not expose a
// *syscall.Stat_t, yields a warn rather than a fail.
func addExecutableOwnershipCheck(report *system.Report, name, path string) {
	if strings.TrimSpace(path) == "" {
		report.AddWarn(name, "no firecracker binary path configured")
		return
	}

	// Lstat, not Stat: a symlink has to be seen as a symlink.
	info, lstatErr := os.Lstat(path)
	if lstatErr != nil {
		report.AddFail(name, fmt.Sprintf("%s: %v", path, lstatErr))
		return
	}

	fileMode := info.Mode()
	perm := fileMode.Perm()
	switch {
	case fileMode&os.ModeSymlink != 0:
		report.AddFail(name,
			fmt.Sprintf("%s is a symlink", path),
			"the helper opens the binary with O_NOFOLLOW; resolve the symlink and update firecracker_bin in the daemon config")
		return
	case !fileMode.IsRegular():
		report.AddFail(name, fmt.Sprintf("%s is not a regular file", path))
		return
	case perm&0o111 == 0:
		// No execute bit for user, group, or other.
		report.AddFail(name,
			fmt.Sprintf("%s mode %#o is not executable", path, perm),
			"chmod +x the binary")
		return
	case perm&0o022 != 0:
		// Group- or world-write bit set.
		report.AddFail(name,
			fmt.Sprintf("%s mode %#o is group/world writable", path, perm),
			"chmod g-w,o-w the binary so the helper accepts it")
		return
	}

	owner, haveOwner := info.Sys().(*syscall.Stat_t)
	if !haveOwner {
		report.AddWarn(name, fmt.Sprintf("%s: cannot read ownership metadata on this platform", path))
		return
	}
	if owner.Uid != 0 {
		report.AddFail(name,
			fmt.Sprintf("%s is owned by uid %d, want 0", path, owner.Uid),
			"`sudo chown root` the firecracker binary")
		return
	}

	report.AddPass(name, fmt.Sprintf("%s: regular, root-owned, mode %#o", path, perm))
}
// addSSHShortcutCheck adds an "ssh shortcut" row when banger's own
// ssh_config file exists: a pass if the user's ~/.ssh/config already
// includes it, otherwise a gentle warn explaining how to enable it.
// It stays a warn, never a fail — the shortcut is opt-in convenience
// and `banger vm ssh` works either way. Nothing is reported when
// banger has no ssh_config path or the file has not been rendered yet.
func (d *Daemon) addSSHShortcutCheck(report *system.Report) {
	const checkName = "ssh shortcut"

	bangerConfig := BangerSSHConfigPath(d.userLayout)
	if strings.TrimSpace(bangerConfig) == "" {
		return
	}
	if _, statErr := os.Stat(bangerConfig); statErr != nil {
		// No banger ssh_config rendered yet — nothing to include.
		return
	}

	installed, err := UserSSHIncludeInstalled()
	switch {
	case err != nil:
		report.AddWarn(checkName, fmt.Sprintf("could not read ~/.ssh/config: %v", err))
	case installed:
		report.AddPass(checkName, "enabled — `ssh <name>.vm` routes through banger")
	default:
		report.AddWarn(
			checkName,
			fmt.Sprintf("`ssh <name>.vm` not enabled (opt-in); run `banger ssh-config --install` or add `Include %s` to ~/.ssh/config", bangerConfig),
		)
	}
}
// addArchitectureCheck records a "host architecture" row: a pass when
// the binary was built for amd64 (runtime.GOARCH), a hard fail for
// anything else. Companion binaries are pinned to amd64 in the
// Makefile, the published kernel catalog ships only x86_64 images,
// and OCI import pulls linux/amd64 layers, so saying it up front
// beats letting users hit cryptic downstream failures.
func addArchitectureCheck(report *system.Report) {
	const checkName = "host architecture"

	if arch := runtime.GOARCH; arch != "amd64" {
		report.AddFail(
			checkName,
			fmt.Sprintf("running on %s; banger today only supports amd64/x86_64 hosts", arch),
			"companion build, kernel catalog, and OCI import all assume linux/amd64",
		)
		return
	}
	report.AddPass(checkName, "amd64")
}
// addVMDefaultsCheck reports the effective VM sizing that `vm run` /
// `vm create` apply when the user omits the flags. It is always a
// PASS row so it always renders, and each field carries the source
// value that model.ResolveVMDefaults reports for it, so users can
// tell what's driving each number.
//
// If the host resources cannot be read, the resolver is called with a
// zero CPU count and zero memory rather than failing the check.
func (d *Daemon) addVMDefaultsCheck(report *system.Report) {
	var (
		cpus     int
		memBytes int64
	)
	if host, err := system.ReadHostResources(); err == nil {
		cpus, memBytes = host.CPUCount, host.TotalMemoryBytes
	}

	resolved := model.ResolveVMDefaults(d.config.VMDefaults, cpus, memBytes)
	report.AddPass("vm defaults",
		fmt.Sprintf("vcpu: %d (%s)", resolved.VCPUCount, resolved.VCPUSource),
		fmt.Sprintf("memory: %d MiB (%s)", resolved.MemoryMiB, resolved.MemorySource),
		fmt.Sprintf("disk: %s (%s)", model.FormatSizeBytes(resolved.WorkDiskSizeBytes), resolved.WorkDiskSource),
		"override any of these in ~/.config/banger/config.toml under [vm_defaults]",
	)
}
// runtimeChecks builds the "host runtime" preflight: the ssh private
// key, the vsock agent helper binary, and — when a store is open and
// a default image name is configured — the files of that image.
//
// Firecracker presence + version is a separate top-level check (see
// addFirecrackerVersionCheck) so the report can carry a distro-aware
// install hint when the binary is missing — RequireExecutable's
// static `hint` string can't do that.
func (d *Daemon) runtimeChecks() *system.Preflight {
	checks := system.NewPreflight()
	checks.RequireFile(d.config.SSHKeyPath, "ssh private key", `set "ssh_key_path" or let banger create its default key`)

	helper, helperErr := vsockAgentBinary(d.layout)
	if helperErr != nil {
		checks.Addf("%v", helperErr)
	} else {
		checks.RequireExecutable(helper, "vsock agent helper", `run 'make build' or reinstall banger`)
	}

	name := d.config.DefaultImageName
	if d.store == nil || strings.TrimSpace(name) == "" {
		return checks
	}

	image, lookupErr := d.store.GetImageByName(context.Background(), name)
	if lookupErr != nil {
		// If the default image isn't local but is cataloged, vm create
		// will auto-pull it on first use — no error to surface.
		if !defaultImageInCatalog(name) {
			checks.Addf("default image %q is not registered and not in the imagecat catalog", name)
		}
		return checks
	}

	const imageHint = `re-register or rebuild the default image`
	checks.RequireFile(image.RootfsPath, "default image rootfs", imageHint)
	checks.RequireFile(image.KernelPath, "default image kernel", imageHint)
	if strings.TrimSpace(image.InitrdPath) != "" {
		// The initrd is only required when the image declares one.
		checks.RequireFile(image.InitrdPath, "default image initrd", imageHint)
	}
	return checks
}
// defaultImageInCatalog reports whether name can be looked up in the
// embedded imagecat catalog. A catalog that fails to load counts as
// "not in the catalog".
func defaultImageInCatalog(name string) bool {
	catalog, loadErr := imagecat.LoadEmbedded()
	if loadErr != nil {
		return false
	}
	if _, lookupErr := catalog.Lookup(name); lookupErr != nil {
		return false
	}
	return true
}
// coreVMLifecycleChecks builds the "core vm lifecycle" preflight by
// letting d.vm register its base start-command prerequisites on a
// fresh Preflight.
func (d *Daemon) coreVMLifecycleChecks() *system.Preflight {
	prereqs := system.NewPreflight()
	d.vm.addBaseStartCommandPrereqs(prereqs)
	return prereqs
}
// vsockChecks builds the "vsock guest agent" preflight: the vsock
// agent helper must resolve to an executable, and the vsock host
// device file (d.vm.vsockHostDevice) must exist. A failure to locate
// the helper is recorded as a problem rather than aborting the
// preflight.
func (d *Daemon) vsockChecks() *system.Preflight {
	checks := system.NewPreflight()

	helper, helperErr := vsockAgentBinary(d.layout)
	if helperErr != nil {
		checks.Addf("%v", helperErr)
	} else {
		checks.RequireExecutable(helper, "vsock agent helper", `run 'make build' or reinstall banger`)
	}

	checks.RequireFile(d.vm.vsockHostDevice, "vsock host device", "load the vhost_vsock kernel module on the host")
	return checks
}
// runtimeStatus returns the status text doctorReport passes alongside
// the "host runtime" preflight. It depends only on whether
// firecracker_bin is set to a non-blank value in cfg.
//
// NOTE(review): runtimeChecks no longer checks firecracker itself,
// and addFirecrackerVersionCheck falls back to "firecracker" when the
// setting is blank, so this wording may be stale — confirm before
// relying on it.
func runtimeStatus(cfg model.DaemonConfig) string {
	configured := strings.TrimSpace(cfg.FirecrackerBin) != ""
	if configured {
		return "firecracker and ssh key resolved"
	}
	return "firecracker not configured"
}