banger/internal/daemon/capabilities.go
Thales Maciel fa4292756d
daemon: surface previously-swallowed errors at warn
Three recovery-path errors were silently dropped:

- vm_lifecycle.go startVMLocked persisted the VMStateError record
  with `_ = s.store.UpsertVM(...)`. If the persist failed the user
  saw the original start error but operators had no way to find
  out the store had also drifted out of sync.
- vm_lifecycle.go deleteVMLocked killed the firecracker process
  with `_ = s.net.killVMProcess(...)`. cleanupRuntime tears it
  down regardless, so the explicit kill is best-effort, but a
  permission-denied / EPERM was still worth logging.
- capabilities.go cleanupPreparedCapabilities collected per-cap
  errors with errors.Join. Callers get the aggregated value but
  couldn't tell which capability failed when more than one did.

All three now log Warn before the original behaviour continues.
The aggregate return value, control flow, and user-visible error
strings are unchanged — this is purely a "less silence in the
journal" pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 22:30:51 -03:00

409 lines
13 KiB
Go

package daemon
import (
"context"
"errors"
"log/slog"
"net"
"os"
"strings"
"time"
"github.com/miekg/dns"
"banger/internal/firecracker"
"banger/internal/guestconfig"
"banger/internal/model"
"banger/internal/store"
"banger/internal/system"
"banger/internal/vmdns"
)
// vmCapability is the base capability tag. Actual behaviour lives on
// optional sub-interfaces (startPreflight / guestConfig / machineConfig
// / prepareHost / postStart / cleanup / configChange / doctor); a
// capability implements whichever subset it cares about. None of them
// take *Daemon — each capability is a struct constructed with its
// explicit service-pointer dependencies at wireServices time.
type vmCapability interface {
Name() string
}
type startPreflightCapability interface {
AddStartPreflight(context.Context, *system.Preflight, model.VMRecord, model.Image)
}
type guestConfigCapability interface {
ContributeGuest(*guestconfig.Builder, model.VMRecord, model.Image)
}
type machineConfigCapability interface {
ContributeMachine(*firecracker.MachineConfig, model.VMRecord, model.Image)
}
type prepareHostCapability interface {
PrepareHost(context.Context, *model.VMRecord, model.Image) error
}
type postStartCapability interface {
PostStart(context.Context, model.VMRecord, model.Image) error
}
type cleanupCapability interface {
Cleanup(context.Context, model.VMRecord) error
}
type configChangeCapability interface {
ApplyConfigChange(context.Context, model.VMRecord, model.VMRecord) error
}
type doctorCapability interface {
AddDoctorChecks(context.Context, *system.Report)
}
// defaultCapabilities builds the production capability list from
// already-constructed services. Called from wireServices once d.vm /
// d.ws / d.net are populated, so every capability ships with the
// concrete service pointers it needs and none of them reach through
// *Daemon at dispatch time.
func (d *Daemon) defaultCapabilities() []vmCapability {
return []vmCapability{
newWorkDiskCapability(d.vm, d.ws, d.store, d.config.DefaultImageName),
newDNSCapability(d.net),
newNATCapability(d.vm, d.net, d.logger),
}
}
func (d *Daemon) addCapabilityStartPrereqs(ctx context.Context, checks *system.Preflight, vm model.VMRecord, image model.Image) {
for _, capability := range d.vmCaps {
if hook, ok := capability.(startPreflightCapability); ok {
hook.AddStartPreflight(ctx, checks, vm, image)
}
}
}
func (d *Daemon) contributeGuestConfig(builder *guestconfig.Builder, vm model.VMRecord, image model.Image) {
for _, capability := range d.vmCaps {
if hook, ok := capability.(guestConfigCapability); ok {
hook.ContributeGuest(builder, vm, image)
}
}
}
func (d *Daemon) contributeMachineConfig(cfg *firecracker.MachineConfig, vm model.VMRecord, image model.Image) {
for _, capability := range d.vmCaps {
if hook, ok := capability.(machineConfigCapability); ok {
hook.ContributeMachine(cfg, vm, image)
}
}
}
func (d *Daemon) prepareCapabilityHosts(ctx context.Context, vm *model.VMRecord, image model.Image) error {
prepared := make([]vmCapability, 0, len(d.vmCaps))
for _, capability := range d.vmCaps {
hook, ok := capability.(prepareHostCapability)
if !ok {
continue
}
if err := hook.PrepareHost(ctx, vm, image); err != nil {
d.cleanupPreparedCapabilities(context.Background(), vm, prepared)
return err
}
prepared = append(prepared, capability)
}
return nil
}
func (d *Daemon) postStartCapabilities(ctx context.Context, vm model.VMRecord, image model.Image) error {
for _, capability := range d.vmCaps {
switch capability.Name() {
case "dns":
vmCreateStage(ctx, "apply_dns", "publishing vm dns record")
case "nat":
if vm.Spec.NATEnabled {
vmCreateStage(ctx, "apply_nat", "configuring nat")
}
}
if hook, ok := capability.(postStartCapability); ok {
if err := hook.PostStart(ctx, vm, image); err != nil {
return err
}
}
}
return nil
}
func (d *Daemon) cleanupCapabilityState(ctx context.Context, vm model.VMRecord) error {
return d.cleanupPreparedCapabilities(ctx, &vm, d.vmCaps)
}
func (d *Daemon) cleanupPreparedCapabilities(ctx context.Context, vm *model.VMRecord, capabilities []vmCapability) error {
var err error
for index := len(capabilities) - 1; index >= 0; index-- {
hook, ok := capabilities[index].(cleanupCapability)
if !ok {
continue
}
cleanupErr := hook.Cleanup(ctx, *vm)
if cleanupErr != nil && d.logger != nil {
// Log per-capability cleanup failures. The aggregate
// errors.Join return value is still the contract for
// callers, but a multi-failure cleanup hides which
// capability misbehaved unless we surface each one
// individually here.
d.logger.Warn("capability cleanup failed", append(vmLogAttrs(*vm), "capability", capabilities[index].Name(), "error", cleanupErr.Error())...)
}
err = joinErr(err, cleanupErr)
}
return err
}
func (d *Daemon) applyCapabilityConfigChanges(ctx context.Context, before, after model.VMRecord) error {
for _, capability := range d.vmCaps {
if hook, ok := capability.(configChangeCapability); ok {
if err := hook.ApplyConfigChange(ctx, before, after); err != nil {
return err
}
}
}
return nil
}
func (d *Daemon) addCapabilityDoctorChecks(ctx context.Context, report *system.Report) {
for _, capability := range d.vmCaps {
if hook, ok := capability.(doctorCapability); ok {
hook.AddDoctorChecks(ctx, report)
}
}
}
// workDiskCapability provisions a per-VM work disk (image-seeded or
// freshly formatted) and syncs host-side authorised keys + git
// identity + file_sync entries onto it. Holds pointers to the VM and
// workspace services because PrepareHost orchestrates across both,
// plus the store + default image name for its doctor check.
type workDiskCapability struct {
vm *VMService
ws *WorkspaceService
store *store.Store
defaultImageName string
}
func newWorkDiskCapability(vm *VMService, ws *WorkspaceService, st *store.Store, defaultImageName string) workDiskCapability {
return workDiskCapability{
vm: vm,
ws: ws,
store: st,
defaultImageName: defaultImageName,
}
}
func (workDiskCapability) Name() string { return "work-disk" }
func (workDiskCapability) AddStartPreflight(_ context.Context, checks *system.Preflight, vm model.VMRecord, image model.Image) {
if exists(vm.Runtime.WorkDiskPath) {
return
}
imageSeed := ""
if image.RootfsPath != "" {
imageSeed = image.WorkSeedPath
}
if exists(imageSeed) {
if info, err := os.Stat(imageSeed); err == nil && vm.Spec.WorkDiskSizeBytes > info.Size() {
checks.RequireCommand("e2fsck", toolHint("e2fsck"))
checks.RequireCommand("resize2fs", toolHint("resize2fs"))
}
return
}
for _, command := range []string{"mkfs.ext4", "mount", "umount", "cp"} {
checks.RequireCommand(command, toolHint(command))
}
}
func (workDiskCapability) ContributeGuest(builder *guestconfig.Builder, _ model.VMRecord, _ model.Image) {
builder.AddMount(guestconfig.MountSpec{
Source: "/dev/vdb",
Target: "/root",
FSType: "ext4",
Options: []string{"defaults"},
Dump: 0,
Pass: 2,
})
}
func (workDiskCapability) ContributeMachine(cfg *firecracker.MachineConfig, vm model.VMRecord, _ model.Image) {
cfg.Drives = append(cfg.Drives, firecracker.DriveConfig{
ID: "work",
Path: vm.Runtime.WorkDiskPath,
ReadOnly: false,
})
}
func (c workDiskCapability) PrepareHost(ctx context.Context, vm *model.VMRecord, image model.Image) error {
prep, err := c.vm.ensureWorkDisk(ctx, vm, image)
if err != nil {
return err
}
if err := c.ws.ensureAuthorizedKeyOnWorkDisk(ctx, vm, image, prep); err != nil {
return err
}
if err := c.ws.ensureGitIdentityOnWorkDisk(ctx, vm); err != nil {
return err
}
return c.ws.runFileSync(ctx, vm)
}
func (c workDiskCapability) AddDoctorChecks(_ context.Context, report *system.Report) {
if c.store != nil && strings.TrimSpace(c.defaultImageName) != "" {
if image, err := c.store.GetImageByName(context.Background(), c.defaultImageName); err == nil && strings.TrimSpace(image.WorkSeedPath) != "" && exists(image.WorkSeedPath) {
checks := system.NewPreflight()
checks.RequireFile(image.WorkSeedPath, "default image work-seed", `rebuild the default image to regenerate the /root seed`)
report.AddPreflight("feature /root work disk", checks, "seeded /root work disk artifact available")
return
}
}
checks := system.NewPreflight()
for _, command := range []string{"truncate", "mkfs.ext4"} {
checks.RequireCommand(command, toolHint(command))
}
report.AddPreflight("feature /root work disk", checks, "fallback /root work disk tooling available")
report.AddWarn("feature /root work disk", "default image has no work-seed artifact; guest /root will be empty until the image is rebuilt")
}
// dnsCapability publishes + removes <vm>.vm records on the in-process
// DNS server. Only needs HostNetwork.
type dnsCapability struct {
net *HostNetwork
}
func newDNSCapability(net *HostNetwork) dnsCapability {
return dnsCapability{net: net}
}
func (dnsCapability) Name() string { return "dns" }
func (c dnsCapability) PostStart(ctx context.Context, vm model.VMRecord, _ model.Image) error {
return c.net.setDNS(ctx, vm.Name, vm.Runtime.GuestIP)
}
func (c dnsCapability) Cleanup(_ context.Context, vm model.VMRecord) error {
return c.net.removeDNS(vm.Runtime.DNSName)
}
func (dnsCapability) AddDoctorChecks(_ context.Context, report *system.Report) {
conn, err := net.ListenPacket("udp", vmdns.DefaultListenAddr)
if err != nil {
if strings.Contains(strings.ToLower(err.Error()), "address already in use") {
// "Already in use" is the expected state when banger's own
// daemon is running. Probe the listener with a *.vm query
// the banger DNS server is the only thing on the host
// authoritative for, and pass if the response shape matches.
if probeBangerDNS(vmdns.DefaultListenAddr) {
report.AddPass("feature vm dns", "banger DNS server is already serving "+vmdns.DefaultListenAddr)
return
}
report.AddWarn("feature vm dns", "listener address "+vmdns.DefaultListenAddr+" is held by another process")
return
}
report.AddFail("feature vm dns", "cannot bind "+vmdns.DefaultListenAddr+": "+err.Error())
return
}
_ = conn.Close()
report.AddPass("feature vm dns", "listener can bind "+vmdns.DefaultListenAddr)
}
// probeBangerDNS returns true iff a UDP DNS query to addr is answered
// by something that behaves like banger's vmdns server: a *.vm name
// produces an authoritative NXDOMAIN. Any other listener (a stub
// resolver, a different DNS server) either refuses, recurses, or
// returns non-authoritative — all distinguishable from this probe.
func probeBangerDNS(addr string) bool {
client := &dns.Client{Net: "udp", Timeout: 500 * time.Millisecond}
req := new(dns.Msg)
req.SetQuestion("doctor-probe-not-a-real-vm.vm.", dns.TypeA)
resp, _, err := client.Exchange(req, addr)
if err != nil || resp == nil {
return false
}
return resp.Authoritative && resp.Rcode == dns.RcodeNameError
}
// natCapability sets up host-side NAT so guest traffic can reach the
// outside world. Needs VMService (tap lookup + aliveness) and
// HostNetwork (NAT rules), plus the daemon logger for the cleanup
// short-circuit note.
type natCapability struct {
vm *VMService
net *HostNetwork
logger *slog.Logger
}
func newNATCapability(vm *VMService, net *HostNetwork, logger *slog.Logger) natCapability {
return natCapability{vm: vm, net: net, logger: logger}
}
func (natCapability) Name() string { return "nat" }
func (c natCapability) AddStartPreflight(ctx context.Context, checks *system.Preflight, vm model.VMRecord, _ model.Image) {
if !vm.Spec.NATEnabled {
return
}
c.net.addNATPrereqs(ctx, checks)
}
func (c natCapability) PostStart(ctx context.Context, vm model.VMRecord, _ model.Image) error {
if !vm.Spec.NATEnabled {
return nil
}
return c.net.ensureNAT(ctx, vm.Runtime.GuestIP, c.vm.vmHandles(vm.ID).TapDevice, true)
}
func (c natCapability) Cleanup(ctx context.Context, vm model.VMRecord) error {
if !vm.Spec.NATEnabled {
return nil
}
// Handle cache is volatile across daemon restarts; Runtime is
// the persisted DB-backed copy. Fall back so a crash / corrupt
// handles.json doesn't leak iptables rules keyed off the tap.
tap := strings.TrimSpace(c.vm.vmHandles(vm.ID).TapDevice)
if tap == "" {
tap = strings.TrimSpace(vm.Runtime.TapDevice)
}
if strings.TrimSpace(vm.Runtime.GuestIP) == "" || tap == "" {
if c.logger != nil {
c.logger.Debug("skipping nat cleanup without runtime network handles", append(vmLogAttrs(vm), "guest_ip", vm.Runtime.GuestIP, "tap_device", tap)...)
}
return nil
}
return c.net.ensureNAT(ctx, vm.Runtime.GuestIP, tap, false)
}
func (c natCapability) ApplyConfigChange(ctx context.Context, before, after model.VMRecord) error {
if before.Spec.NATEnabled == after.Spec.NATEnabled {
return nil
}
if !c.vm.vmAlive(after) {
return nil
}
return c.net.ensureNAT(ctx, after.Runtime.GuestIP, c.vm.vmHandles(after.ID).TapDevice, after.Spec.NATEnabled)
}
func (c natCapability) AddDoctorChecks(ctx context.Context, report *system.Report) {
checks := system.NewPreflight()
checks.RequireCommand("ip", toolHint("ip"))
c.net.addNATPrereqs(ctx, checks)
if len(checks.Problems()) > 0 {
report.Add(system.CheckStatusFail, "feature nat", checks.Problems()...)
return
}
uplink, err := c.net.defaultUplink(ctx)
if err != nil {
report.AddFail("feature nat", err.Error())
return
}
report.AddPass("feature nat", "iptables/sysctl available, uplink "+uplink)
}
func joinErr(current, next error) error {
return errors.Join(current, next)
}