`docs/privileges.md` now documents what the install promises (helper +
daemon services active, sockets at 0600 ownerUID, units carrying the
hardening directives, firecracker root-owned + non-writable). Doctor
now verifies that the running install matches the doc; without that
check, drift between the doc and the filesystem would silently weaken
the trust model.
In system mode (install.toml present):
* helper service / owner daemon service: `systemctl is-active`.
* helper socket / daemon socket: stat-and-compare mode + uid against
the registered owner.
* helper unit hardening / daemon unit hardening: scan the rendered
unit for NoNewPrivileges, ProtectSystem=strict, ProtectHome
(=yes for the helper, =read-only for the daemon), RestrictSUIDSGID,
LockPersonality, and the helper's CapabilityBoundingSet line. The
daemon unit also pins User=<registered owner>.
* firecracker binary ownership: regular file, not a symlink, mode
not group/world writable, executable, owned by uid 0 — same
constraints validateRootExecutable enforces at launch, surfaced
once at doctor time so a misconfigured binary fails fast with a
clearer error than the helper's open-time rejection.
In non-system mode (no /etc/banger/install.toml) doctor emits a single
WARN row pointing at docs/privileges.md > 'Running outside the system
install'. A PASS would imply guarantees the install isn't actually
providing.
Tests cover both branches: the non-system warn pins its message
substrings; system-mode pins that every check name shows up; and the
helpers (socket-perms, unit-hardening, executable-ownership) have
direct table-style negative tests.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
417 lines
15 KiB
Go
417 lines
15 KiB
Go
package daemon
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"runtime"
|
|
"strings"
|
|
"syscall"
|
|
|
|
"banger/internal/config"
|
|
"banger/internal/imagecat"
|
|
"banger/internal/installmeta"
|
|
"banger/internal/model"
|
|
"banger/internal/paths"
|
|
"banger/internal/store"
|
|
"banger/internal/system"
|
|
)
|
|
|
|
// systemdSystemDir is the directory the security-posture checks read
// the helper and daemon unit files from. It is a var rather than a
// const so tests can point it at a temporary directory instead of
// needing a real /etc/systemd/system on the test host.
var systemdSystemDir = "/etc/systemd/system"
|
|
|
|
func Doctor(ctx context.Context) (system.Report, error) {
|
|
userLayout, err := paths.Resolve()
|
|
if err != nil {
|
|
return system.Report{}, err
|
|
}
|
|
cfg, err := config.Load(userLayout)
|
|
if err != nil {
|
|
return system.Report{}, err
|
|
}
|
|
layout := paths.ResolveSystem()
|
|
// Doctor must be read-only: running it should never mutate the
|
|
// state DB (no migrations, no WAL checkpoint, no pragma writes).
|
|
// Skip OpenReadOnly entirely when the DB file doesn't exist —
|
|
// that's a fresh install, not an error condition. The first
|
|
// daemon start will create the file. storeMissing differentiates
|
|
// "no DB yet" (pass) from "DB present but unreadable" (fail) in
|
|
// the report.
|
|
d := &Daemon{
|
|
layout: layout,
|
|
userLayout: userLayout,
|
|
config: cfg,
|
|
runner: system.NewRunner(),
|
|
}
|
|
var storeErr error
|
|
storeMissing := false
|
|
if _, statErr := os.Stat(layout.DBPath); statErr != nil {
|
|
if os.IsNotExist(statErr) {
|
|
storeMissing = true
|
|
} else {
|
|
storeErr = statErr
|
|
}
|
|
} else {
|
|
db, err := store.OpenReadOnly(layout.DBPath)
|
|
if err != nil {
|
|
storeErr = err
|
|
} else {
|
|
defer db.Close()
|
|
d.store = db
|
|
}
|
|
}
|
|
wireServices(d)
|
|
return d.doctorReport(ctx, storeErr, storeMissing), nil
|
|
}
|
|
|
|
func (d *Daemon) doctorReport(ctx context.Context, storeErr error, storeMissing bool) system.Report {
|
|
report := system.Report{}
|
|
|
|
addArchitectureCheck(&report)
|
|
|
|
switch {
|
|
case storeMissing:
|
|
report.AddPass("state store", "will be created on first daemon start at "+d.layout.DBPath)
|
|
case storeErr != nil:
|
|
report.AddFail(
|
|
"state store",
|
|
fmt.Sprintf("open %s: %v", d.layout.DBPath, storeErr),
|
|
"remove or restore the file if corrupt; otherwise check its permissions",
|
|
)
|
|
default:
|
|
report.AddPass("state store", "readable at "+d.layout.DBPath)
|
|
}
|
|
|
|
report.AddPreflight("host runtime", d.runtimeChecks(), runtimeStatus(d.config))
|
|
report.AddPreflight("core vm lifecycle", d.coreVMLifecycleChecks(), "required host tools available")
|
|
report.AddPreflight("vsock guest agent", d.vsockChecks(), "vsock guest agent prerequisites available")
|
|
d.addVMDefaultsCheck(&report)
|
|
d.addSSHShortcutCheck(&report)
|
|
d.addCapabilityDoctorChecks(ctx, &report)
|
|
d.addSecurityPostureChecks(ctx, &report)
|
|
|
|
return report
|
|
}
|
|
|
|
// addSecurityPostureChecks verifies the install matches what
|
|
// docs/privileges.md describes: helper + owner-daemon units active,
|
|
// sockets at the expected mode/owner, unit files carrying the
|
|
// hardening directives, and the firecracker binary owned by root +
|
|
// non-writable. Drift between the doc and the running install would
|
|
// silently weaken the trust model; surfacing it here makes the doc
|
|
// load-bearing rather than aspirational.
|
|
//
|
|
// In non-system mode (no /etc/banger/install.toml) emits a single
|
|
// warn pointing at the docs section that explains the looser dev-mode
|
|
// trust model — a doctor PASS row in that mode would imply guarantees
|
|
// the install isn't actually providing.
|
|
func (d *Daemon) addSecurityPostureChecks(ctx context.Context, report *system.Report) {
|
|
d.addSecurityPostureChecksAt(ctx, report, installmeta.DefaultPath, systemdSystemDir)
|
|
}
|
|
|
|
// addSecurityPostureChecksAt is the seam tests use: pass a fake
|
|
// install.toml + systemd dir to exercise the system-mode branch
|
|
// without writing to /etc.
|
|
func (d *Daemon) addSecurityPostureChecksAt(ctx context.Context, report *system.Report, installPath, systemdDir string) {
|
|
meta, err := installmeta.Load(installPath)
|
|
if err != nil {
|
|
report.AddWarn("security posture",
|
|
"running outside the system install (no "+installPath+")",
|
|
"helper SO_PEERCRED, narrow CapabilityBoundingSet, NoNewPrivileges, and ProtectSystem=strict are bypassed in this mode",
|
|
"see docs/privileges.md > 'Running outside the system install'; install via `sudo banger system install --owner $USER` for the supported trust model")
|
|
return
|
|
}
|
|
addServiceActiveCheck(ctx, d.runner, report, "helper service", installmeta.DefaultRootHelperService)
|
|
addServiceActiveCheck(ctx, d.runner, report, "owner daemon service", installmeta.DefaultService)
|
|
addSocketPermsCheck(report, "helper socket", installmeta.DefaultRootHelperSocketPath, meta.OwnerUID, 0o600)
|
|
addSocketPermsCheck(report, "daemon socket", installmeta.DefaultSocketPath, meta.OwnerUID, 0o600)
|
|
addUnitHardeningCheck(report, "helper unit hardening",
|
|
filepath.Join(systemdDir, installmeta.DefaultRootHelperService),
|
|
[]string{
|
|
"NoNewPrivileges=yes",
|
|
"ProtectSystem=strict",
|
|
"ProtectHome=yes",
|
|
"RestrictSUIDSGID=yes",
|
|
"LockPersonality=yes",
|
|
"CapabilityBoundingSet=",
|
|
})
|
|
addUnitHardeningCheck(report, "daemon unit hardening",
|
|
filepath.Join(systemdDir, installmeta.DefaultService),
|
|
[]string{
|
|
"User=" + meta.OwnerUser,
|
|
"NoNewPrivileges=yes",
|
|
"ProtectSystem=strict",
|
|
"ProtectHome=read-only",
|
|
"RestrictSUIDSGID=yes",
|
|
"LockPersonality=yes",
|
|
})
|
|
addExecutableOwnershipCheck(report, "firecracker binary ownership", d.config.FirecrackerBin)
|
|
}
|
|
|
|
// addServiceActiveCheck shells `systemctl is-active <svc>` and surfaces
|
|
// the result. is-active exits non-zero for inactive/failed states but
|
|
// always prints the state on stdout, so we read the trimmed output and
|
|
// ignore the exit code. Anything other than "active" is a fail with a
|
|
// systemctl-restart hint.
|
|
func addServiceActiveCheck(ctx context.Context, runner system.CommandRunner, report *system.Report, name, service string) {
|
|
out, _ := runner.Run(ctx, "systemctl", "is-active", service)
|
|
state := strings.TrimSpace(string(out))
|
|
if state == "" {
|
|
state = "unknown"
|
|
}
|
|
if state == "active" {
|
|
report.AddPass(name, fmt.Sprintf("%s is active", service))
|
|
return
|
|
}
|
|
report.AddFail(name,
|
|
fmt.Sprintf("%s is %s, not active", service, state),
|
|
fmt.Sprintf("run `sudo systemctl restart %s` and re-run `banger doctor`", service))
|
|
}
|
|
|
|
// addSocketPermsCheck stat()s the socket path and compares mode +
|
|
// owner against the values the install promises. Both daemon and
|
|
// helper sockets are 0600 chowned to the registered owner UID; any
|
|
// drift means filesystem perms aren't gating access the way the docs
|
|
// describe.
|
|
func addSocketPermsCheck(report *system.Report, name, path string, expectedUID int, expectedMode os.FileMode) {
|
|
info, err := os.Stat(path)
|
|
if err != nil {
|
|
report.AddFail(name,
|
|
fmt.Sprintf("%s: %v", path, err),
|
|
"is the service running? `sudo systemctl status` and check the runtime dir")
|
|
return
|
|
}
|
|
stat, ok := info.Sys().(*syscall.Stat_t)
|
|
if !ok {
|
|
report.AddWarn(name, fmt.Sprintf("%s: cannot read ownership metadata on this platform", path))
|
|
return
|
|
}
|
|
actualMode := info.Mode().Perm()
|
|
var problems []string
|
|
if actualMode != expectedMode {
|
|
problems = append(problems, fmt.Sprintf("mode is %#o, want %#o", actualMode, expectedMode))
|
|
}
|
|
if int(stat.Uid) != expectedUID {
|
|
problems = append(problems, fmt.Sprintf("uid is %d, want %d", stat.Uid, expectedUID))
|
|
}
|
|
if len(problems) > 0 {
|
|
problems = append(problems, "restart the service so the socket gets recreated with correct perms")
|
|
report.AddFail(name, fmt.Sprintf("%s: %s", path, strings.Join(problems, "; ")))
|
|
return
|
|
}
|
|
report.AddPass(name, fmt.Sprintf("%s: mode %#o, uid %d", path, actualMode, expectedUID))
|
|
}
|
|
|
|
// addUnitHardeningCheck reads the systemd unit file and confirms
|
|
// every required directive is present as a literal substring. Brittle
|
|
// to formatting changes (a comment-out would slip through), but
|
|
// strong enough to catch the "someone hand-edited the unit and
|
|
// dropped NoNewPrivileges" failure mode that motivates this check.
|
|
// The directives list captures the security-relevant subset of the
|
|
// renderer in commands_system.go; everything else (Description,
|
|
// ExecStart, etc.) is operational and not worth pinning here.
|
|
func addUnitHardeningCheck(report *system.Report, name, path string, required []string) {
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
report.AddFail(name,
|
|
fmt.Sprintf("%s: %v", path, err),
|
|
"reinstall via `sudo banger system install` to refresh the unit")
|
|
return
|
|
}
|
|
content := string(data)
|
|
var missing []string
|
|
for _, directive := range required {
|
|
if !strings.Contains(content, directive) {
|
|
missing = append(missing, directive)
|
|
}
|
|
}
|
|
if len(missing) > 0 {
|
|
report.AddFail(name,
|
|
fmt.Sprintf("%s missing directives: %s", path, strings.Join(missing, ", ")),
|
|
"reinstall via `sudo banger system install` to refresh the unit")
|
|
return
|
|
}
|
|
report.AddPass(name, fmt.Sprintf("%s: %d hardening directives present", path, len(required)))
|
|
}
|
|
|
|
// addExecutableOwnershipCheck mirrors validateRootExecutable's runtime
|
|
// check at doctor time: regular file, root-owned, executable, not
|
|
// group/world writable, not a symlink. Doctor catching this once at
|
|
// install time beats the helper failing every launch with a less
|
|
// helpful message.
|
|
func addExecutableOwnershipCheck(report *system.Report, name, path string) {
|
|
if strings.TrimSpace(path) == "" {
|
|
report.AddWarn(name, "no firecracker binary path configured")
|
|
return
|
|
}
|
|
info, err := os.Lstat(path)
|
|
if err != nil {
|
|
report.AddFail(name, fmt.Sprintf("%s: %v", path, err))
|
|
return
|
|
}
|
|
if info.Mode()&os.ModeSymlink != 0 {
|
|
report.AddFail(name,
|
|
fmt.Sprintf("%s is a symlink", path),
|
|
"the helper opens the binary with O_NOFOLLOW; resolve the symlink and update firecracker_bin in the daemon config")
|
|
return
|
|
}
|
|
if !info.Mode().IsRegular() {
|
|
report.AddFail(name, fmt.Sprintf("%s is not a regular file", path))
|
|
return
|
|
}
|
|
mode := info.Mode().Perm()
|
|
if mode&0o111 == 0 {
|
|
report.AddFail(name,
|
|
fmt.Sprintf("%s mode %#o is not executable", path, mode),
|
|
"chmod +x the binary")
|
|
return
|
|
}
|
|
if mode&0o022 != 0 {
|
|
report.AddFail(name,
|
|
fmt.Sprintf("%s mode %#o is group/world writable", path, mode),
|
|
"chmod g-w,o-w the binary so the helper accepts it")
|
|
return
|
|
}
|
|
stat, ok := info.Sys().(*syscall.Stat_t)
|
|
if !ok {
|
|
report.AddWarn(name, fmt.Sprintf("%s: cannot read ownership metadata on this platform", path))
|
|
return
|
|
}
|
|
if stat.Uid != 0 {
|
|
report.AddFail(name,
|
|
fmt.Sprintf("%s is owned by uid %d, want 0", path, stat.Uid),
|
|
"`sudo chown root` the firecracker binary")
|
|
return
|
|
}
|
|
report.AddPass(name, fmt.Sprintf("%s: regular, root-owned, mode %#o", path, mode))
|
|
}
|
|
|
|
// addSSHShortcutCheck surfaces a gentle warning when banger maintains
|
|
// an ssh_config file but the user hasn't wired it into ~/.ssh/config.
|
|
// This is intentionally a warn, not a fail — the shortcut is opt-in
|
|
// convenience and `banger vm ssh` works either way.
|
|
func (d *Daemon) addSSHShortcutCheck(report *system.Report) {
|
|
bangerConfig := BangerSSHConfigPath(d.userLayout)
|
|
if strings.TrimSpace(bangerConfig) == "" {
|
|
return
|
|
}
|
|
if _, err := os.Stat(bangerConfig); err != nil {
|
|
// No banger ssh_config rendered yet — nothing to include.
|
|
return
|
|
}
|
|
installed, err := UserSSHIncludeInstalled()
|
|
if err != nil {
|
|
report.AddWarn("ssh shortcut", fmt.Sprintf("could not read ~/.ssh/config: %v", err))
|
|
return
|
|
}
|
|
if installed {
|
|
report.AddPass("ssh shortcut", "enabled — `ssh <name>.vm` routes through banger")
|
|
return
|
|
}
|
|
report.AddWarn(
|
|
"ssh shortcut",
|
|
fmt.Sprintf("`ssh <name>.vm` not enabled (opt-in); run `banger ssh-config --install` or add `Include %s` to ~/.ssh/config", bangerConfig),
|
|
)
|
|
}
|
|
|
|
// addArchitectureCheck surfaces a hard-fail when banger is running on
|
|
// a non-amd64 host. Companion binaries are pinned to amd64 in the
|
|
// Makefile, the published kernel catalog ships only x86_64 images, and
|
|
// OCI import pulls linux/amd64 layers. Letting users discover this
|
|
// through cryptic downstream failures is worse than saying it up front.
|
|
func addArchitectureCheck(report *system.Report) {
|
|
if runtime.GOARCH == "amd64" {
|
|
report.AddPass("host architecture", "amd64")
|
|
return
|
|
}
|
|
report.AddFail(
|
|
"host architecture",
|
|
fmt.Sprintf("running on %s; banger today only supports amd64/x86_64 hosts", runtime.GOARCH),
|
|
"companion build, kernel catalog, and OCI import all assume linux/amd64",
|
|
)
|
|
}
|
|
|
|
// addVMDefaultsCheck surfaces the effective VM sizing that `vm run` /
|
|
// `vm create` will apply when the user omits the flags. Shown as a
|
|
// PASS check so it always renders, with per-field provenance
|
|
// (config|auto|builtin) so users can tell what's driving each number.
|
|
func (d *Daemon) addVMDefaultsCheck(report *system.Report) {
|
|
host, err := system.ReadHostResources()
|
|
var cpus int
|
|
var memBytes int64
|
|
if err == nil {
|
|
cpus = host.CPUCount
|
|
memBytes = host.TotalMemoryBytes
|
|
}
|
|
defaults := model.ResolveVMDefaults(d.config.VMDefaults, cpus, memBytes)
|
|
details := []string{
|
|
fmt.Sprintf("vcpu: %d (%s)", defaults.VCPUCount, defaults.VCPUSource),
|
|
fmt.Sprintf("memory: %d MiB (%s)", defaults.MemoryMiB, defaults.MemorySource),
|
|
fmt.Sprintf("disk: %s (%s)", model.FormatSizeBytes(defaults.WorkDiskSizeBytes), defaults.WorkDiskSource),
|
|
"override any of these in ~/.config/banger/config.toml under [vm_defaults]",
|
|
}
|
|
report.AddPass("vm defaults", details...)
|
|
}
|
|
|
|
func (d *Daemon) runtimeChecks() *system.Preflight {
|
|
checks := system.NewPreflight()
|
|
checks.RequireExecutable(d.config.FirecrackerBin, "firecracker binary", `install firecracker or set "firecracker_bin"`)
|
|
checks.RequireFile(d.config.SSHKeyPath, "ssh private key", `set "ssh_key_path" or let banger create its default key`)
|
|
if helper, err := vsockAgentBinary(d.layout); err == nil {
|
|
checks.RequireExecutable(helper, "vsock agent helper", `run 'make build' or reinstall banger`)
|
|
} else {
|
|
checks.Addf("%v", err)
|
|
}
|
|
if d.store != nil && strings.TrimSpace(d.config.DefaultImageName) != "" {
|
|
name := d.config.DefaultImageName
|
|
image, err := d.store.GetImageByName(context.Background(), name)
|
|
if err == nil {
|
|
checks.RequireFile(image.RootfsPath, "default image rootfs", `re-register or rebuild the default image`)
|
|
checks.RequireFile(image.KernelPath, "default image kernel", `re-register or rebuild the default image`)
|
|
if strings.TrimSpace(image.InitrdPath) != "" {
|
|
checks.RequireFile(image.InitrdPath, "default image initrd", `re-register or rebuild the default image`)
|
|
}
|
|
} else if !defaultImageInCatalog(name) {
|
|
checks.Addf("default image %q is not registered and not in the imagecat catalog", name)
|
|
}
|
|
// If the default image isn't local but is cataloged, vm create
|
|
// will auto-pull it on first use — no error to surface.
|
|
}
|
|
return checks
|
|
}
|
|
|
|
func defaultImageInCatalog(name string) bool {
|
|
catalog, err := imagecat.LoadEmbedded()
|
|
if err != nil {
|
|
return false
|
|
}
|
|
_, err = catalog.Lookup(name)
|
|
return err == nil
|
|
}
|
|
|
|
func (d *Daemon) coreVMLifecycleChecks() *system.Preflight {
|
|
checks := system.NewPreflight()
|
|
d.vm.addBaseStartCommandPrereqs(checks)
|
|
return checks
|
|
}
|
|
|
|
func (d *Daemon) vsockChecks() *system.Preflight {
|
|
checks := system.NewPreflight()
|
|
if helper, err := vsockAgentBinary(d.layout); err == nil {
|
|
checks.RequireExecutable(helper, "vsock agent helper", `run 'make build' or reinstall banger`)
|
|
} else {
|
|
checks.Addf("%v", err)
|
|
}
|
|
checks.RequireFile(d.vm.vsockHostDevice, "vsock host device", "load the vhost_vsock kernel module on the host")
|
|
return checks
|
|
}
|
|
|
|
func runtimeStatus(cfg model.DaemonConfig) string {
|
|
if strings.TrimSpace(cfg.FirecrackerBin) == "" {
|
|
return "firecracker not configured"
|
|
}
|
|
return "firecracker and ssh key resolved"
|
|
}
|