The previous check tried to bind 127.0.0.1:42069 and warned on 'address already in use' — but that is exactly the state when the banger daemon is running, the very case the user ran 'doctor' to confirm, so the warning was actively misleading. Now, on 'address already in use', we probe the listener with a *.vm DNS query that only banger's vmdns server answers authoritatively (NXDOMAIN with Authoritative=true). If the response shape matches we pass; if the port is held by something else we still warn. Tests cover both branches: a real vmdns server is accepted, and a silent UDP listener on the same port is rejected.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
package daemon

import (
	"context"
	"errors"
	"log/slog"
	"net"
	"os"
	"strings"
	"time"

	"github.com/miekg/dns"

	"banger/internal/firecracker"
	"banger/internal/guestconfig"
	"banger/internal/model"
	"banger/internal/store"
	"banger/internal/system"
	"banger/internal/vmdns"
)

// vmCapability is the base capability tag. Actual behaviour lives on
// optional sub-interfaces (startPreflight / guestConfig / machineConfig
// / prepareHost / postStart / cleanup / configChange / doctor); a
// capability implements whichever subset it cares about. None of them
// take *Daemon — each capability is a struct constructed with its
// explicit service-pointer dependencies at wireServices time.
type vmCapability interface {
	Name() string
}

type startPreflightCapability interface {
	AddStartPreflight(context.Context, *system.Preflight, model.VMRecord, model.Image)
}

type guestConfigCapability interface {
	ContributeGuest(*guestconfig.Builder, model.VMRecord, model.Image)
}

type machineConfigCapability interface {
	ContributeMachine(*firecracker.MachineConfig, model.VMRecord, model.Image)
}

type prepareHostCapability interface {
	PrepareHost(context.Context, *model.VMRecord, model.Image) error
}

type postStartCapability interface {
	PostStart(context.Context, model.VMRecord, model.Image) error
}

type cleanupCapability interface {
	Cleanup(context.Context, model.VMRecord) error
}

type configChangeCapability interface {
	ApplyConfigChange(context.Context, model.VMRecord, model.VMRecord) error
}

type doctorCapability interface {
	AddDoctorChecks(context.Context, *system.Report)
}
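
// Optional compile-time guards (a sketch of a common companion to this
// pattern): pin each concrete capability to a sub-interface it is
// meant to satisfy, so a renamed hook method breaks the build instead
// of silently falling out of the type switches below. The concrete
// types referenced here are defined later in this file.
var (
	_ startPreflightCapability = workDiskCapability{}
	_ doctorCapability         = dnsCapability{}
	_ configChangeCapability   = natCapability{}
)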

// defaultCapabilities builds the production capability list from
// already-constructed services. Called from wireServices once d.vm /
// d.ws / d.net are populated, so every capability ships with the
// concrete service pointers it needs and none of them reach through
// *Daemon at dispatch time.
func (d *Daemon) defaultCapabilities() []vmCapability {
	return []vmCapability{
		newWorkDiskCapability(d.vm, d.ws, d.store, d.config.DefaultImageName),
		newDNSCapability(d.net),
		newNATCapability(d.vm, d.net, d.logger),
	}
}
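
// Extending the list is meant to be purely additive. A hypothetical
// capability (illustrative only; metricsCapability is not part of this
// package, and vmLogAttrs is the existing helper used further down)
// implements Name plus whichever hooks it needs, then gets appended to
// the slice above:
//
//	type metricsCapability struct{ logger *slog.Logger }
//
//	func (metricsCapability) Name() string { return "metrics" }
//
//	func (c metricsCapability) PostStart(_ context.Context, vm model.VMRecord, _ model.Image) error {
//		c.logger.Info("vm started", vmLogAttrs(vm)...)
//		return nil
//	}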

func (d *Daemon) addCapabilityStartPrereqs(ctx context.Context, checks *system.Preflight, vm model.VMRecord, image model.Image) {
	for _, capability := range d.vmCaps {
		if hook, ok := capability.(startPreflightCapability); ok {
			hook.AddStartPreflight(ctx, checks, vm, image)
		}
	}
}

func (d *Daemon) contributeGuestConfig(builder *guestconfig.Builder, vm model.VMRecord, image model.Image) {
	for _, capability := range d.vmCaps {
		if hook, ok := capability.(guestConfigCapability); ok {
			hook.ContributeGuest(builder, vm, image)
		}
	}
}

func (d *Daemon) contributeMachineConfig(cfg *firecracker.MachineConfig, vm model.VMRecord, image model.Image) {
	for _, capability := range d.vmCaps {
		if hook, ok := capability.(machineConfigCapability); ok {
			hook.ContributeMachine(cfg, vm, image)
		}
	}
}

func (d *Daemon) prepareCapabilityHosts(ctx context.Context, vm *model.VMRecord, image model.Image) error {
	prepared := make([]vmCapability, 0, len(d.vmCaps))
	for _, capability := range d.vmCaps {
		hook, ok := capability.(prepareHostCapability)
		if !ok {
			continue
		}
		if err := hook.PrepareHost(ctx, vm, image); err != nil {
			d.cleanupPreparedCapabilities(context.Background(), vm, prepared)
			return err
		}
		prepared = append(prepared, capability)
	}
	return nil
}
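
// Failure-unwind sketch (with a hypothetical second capability; in the
// shipped list only work-disk implements PrepareHost): when a later
// PrepareHost fails, only the capabilities that already succeeded are
// cleaned up, in reverse order, and under context.Background() so the
// unwind still runs even if the start context was cancelled.
//
//	PrepareHost(work-disk) -> ok    (appended to prepared)
//	PrepareHost(other)     -> err
//	Cleanup(work-disk)     <- via cleanupPreparedCapabilities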

func (d *Daemon) postStartCapabilities(ctx context.Context, vm model.VMRecord, image model.Image) error {
	for _, capability := range d.vmCaps {
		switch capability.Name() {
		case "dns":
			vmCreateStage(ctx, "apply_dns", "publishing vm dns record")
		case "nat":
			if vm.Spec.NATEnabled {
				vmCreateStage(ctx, "apply_nat", "configuring nat")
			}
		}
		if hook, ok := capability.(postStartCapability); ok {
			if err := hook.PostStart(ctx, vm, image); err != nil {
				return err
			}
		}
	}
	return nil
}

func (d *Daemon) cleanupCapabilityState(ctx context.Context, vm model.VMRecord) error {
	return d.cleanupPreparedCapabilities(ctx, &vm, d.vmCaps)
}

func (d *Daemon) cleanupPreparedCapabilities(ctx context.Context, vm *model.VMRecord, capabilities []vmCapability) error {
	var err error
	for index := len(capabilities) - 1; index >= 0; index-- {
		hook, ok := capabilities[index].(cleanupCapability)
		if !ok {
			continue
		}
		err = joinErr(err, hook.Cleanup(ctx, *vm))
	}
	return err
}

func (d *Daemon) applyCapabilityConfigChanges(ctx context.Context, before, after model.VMRecord) error {
	for _, capability := range d.vmCaps {
		if hook, ok := capability.(configChangeCapability); ok {
			if err := hook.ApplyConfigChange(ctx, before, after); err != nil {
				return err
			}
		}
	}
	return nil
}

func (d *Daemon) addCapabilityDoctorChecks(ctx context.Context, report *system.Report) {
	for _, capability := range d.vmCaps {
		if hook, ok := capability.(doctorCapability); ok {
			hook.AddDoctorChecks(ctx, report)
		}
	}
}

// workDiskCapability provisions a per-VM work disk (image-seeded or
// freshly formatted) and syncs host-side authorised keys + git
// identity + file_sync entries onto it. Holds pointers to the VM and
// workspace services because PrepareHost orchestrates across both,
// plus the store + default image name for its doctor check.
type workDiskCapability struct {
	vm               *VMService
	ws               *WorkspaceService
	store            *store.Store
	defaultImageName string
}

func newWorkDiskCapability(vm *VMService, ws *WorkspaceService, st *store.Store, defaultImageName string) workDiskCapability {
	return workDiskCapability{
		vm:               vm,
		ws:               ws,
		store:            st,
		defaultImageName: defaultImageName,
	}
}

func (workDiskCapability) Name() string { return "work-disk" }

func (workDiskCapability) AddStartPreflight(_ context.Context, checks *system.Preflight, vm model.VMRecord, image model.Image) {
	if exists(vm.Runtime.WorkDiskPath) {
		return
	}
	imageSeed := ""
	if image.RootfsPath != "" {
		imageSeed = image.WorkSeedPath
	}
	if exists(imageSeed) {
		if info, err := os.Stat(imageSeed); err == nil && vm.Spec.WorkDiskSizeBytes > info.Size() {
			checks.RequireCommand("e2fsck", toolHint("e2fsck"))
			checks.RequireCommand("resize2fs", toolHint("resize2fs"))
		}
		return
	}
	for _, command := range []string{"mkfs.ext4", "mount", "umount", "cp"} {
		checks.RequireCommand(command, toolHint(command))
	}
}
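
// Preflight decision sketch (a summary of the branches above):
//
//	work disk already exists           -> nothing required
//	seed exists, requested size > seed -> e2fsck + resize2fs
//	no usable seed                     -> mkfs.ext4 + mount + umount + cp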

func (workDiskCapability) ContributeGuest(builder *guestconfig.Builder, _ model.VMRecord, _ model.Image) {
	builder.AddMount(guestconfig.MountSpec{
		Source:  "/dev/vdb",
		Target:  "/root",
		FSType:  "ext4",
		Options: []string{"defaults"},
		Dump:    0,
		Pass:    2,
	})
}
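
// In fstab terms the spec above renders to roughly the following line
// (assuming the guestconfig builder emits standard fstab fields, with
// Dump and Pass carrying the usual dump / fsck-order semantics):
//
//	/dev/vdb  /root  ext4  defaults  0  2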

func (workDiskCapability) ContributeMachine(cfg *firecracker.MachineConfig, vm model.VMRecord, _ model.Image) {
	cfg.Drives = append(cfg.Drives, firecracker.DriveConfig{
		ID:       "work",
		Path:     vm.Runtime.WorkDiskPath,
		ReadOnly: false,
	})
}

func (c workDiskCapability) PrepareHost(ctx context.Context, vm *model.VMRecord, image model.Image) error {
	prep, err := c.vm.ensureWorkDisk(ctx, vm, image)
	if err != nil {
		return err
	}
	if err := c.ws.ensureAuthorizedKeyOnWorkDisk(ctx, vm, image, prep); err != nil {
		return err
	}
	if err := c.ws.ensureGitIdentityOnWorkDisk(ctx, vm); err != nil {
		return err
	}
	return c.ws.runFileSync(ctx, vm)
}

func (c workDiskCapability) AddDoctorChecks(_ context.Context, report *system.Report) {
	if c.store != nil && strings.TrimSpace(c.defaultImageName) != "" {
		if image, err := c.store.GetImageByName(context.Background(), c.defaultImageName); err == nil && strings.TrimSpace(image.WorkSeedPath) != "" && exists(image.WorkSeedPath) {
			checks := system.NewPreflight()
			checks.RequireFile(image.WorkSeedPath, "default image work-seed", `rebuild the default image to regenerate the /root seed`)
			report.AddPreflight("feature /root work disk", checks, "seeded /root work disk artifact available")
			return
		}
	}
	checks := system.NewPreflight()
	for _, command := range []string{"truncate", "mkfs.ext4"} {
		checks.RequireCommand(command, toolHint(command))
	}
	report.AddPreflight("feature /root work disk", checks, "fallback /root work disk tooling available")
	report.AddWarn("feature /root work disk", "default image has no work-seed artifact; guest /root will be empty until the image is rebuilt")
}

// dnsCapability publishes + removes <vm>.vm records on the in-process
// DNS server. Only needs HostNetwork.
type dnsCapability struct {
	net *HostNetwork
}

func newDNSCapability(net *HostNetwork) dnsCapability {
	return dnsCapability{net: net}
}

func (dnsCapability) Name() string { return "dns" }

func (c dnsCapability) PostStart(ctx context.Context, vm model.VMRecord, _ model.Image) error {
	return c.net.setDNS(ctx, vm.Name, vm.Runtime.GuestIP)
}

func (c dnsCapability) Cleanup(_ context.Context, vm model.VMRecord) error {
	return c.net.removeDNS(vm.Runtime.DNSName)
}

func (dnsCapability) AddDoctorChecks(_ context.Context, report *system.Report) {
	conn, err := net.ListenPacket("udp", vmdns.DefaultListenAddr)
	if err != nil {
		if strings.Contains(strings.ToLower(err.Error()), "address already in use") {
			// "Already in use" is the expected state when banger's own
			// daemon is running. Probe the listener with a *.vm query
			// that only banger's DNS server answers authoritatively,
			// and pass if the response shape matches.
			if probeBangerDNS(vmdns.DefaultListenAddr) {
				report.AddPass("feature vm dns", "banger DNS server is already serving "+vmdns.DefaultListenAddr)
				return
			}
			report.AddWarn("feature vm dns", "listener address "+vmdns.DefaultListenAddr+" is held by another process")
			return
		}
		report.AddFail("feature vm dns", "cannot bind "+vmdns.DefaultListenAddr+": "+err.Error())
		return
	}
	_ = conn.Close()
	report.AddPass("feature vm dns", "listener can bind "+vmdns.DefaultListenAddr)
}

// probeBangerDNS reports whether a UDP DNS query to addr is answered
// by something that behaves like banger's vmdns server: a *.vm name
// must produce an authoritative NXDOMAIN. Any other listener (a stub
// resolver, a different DNS server) either refuses, recurses, or
// returns a non-authoritative answer, all of which fail this check.
func probeBangerDNS(addr string) bool {
	client := &dns.Client{Net: "udp", Timeout: 500 * time.Millisecond}
	req := new(dns.Msg)
	req.SetQuestion("doctor-probe-not-a-real-vm.vm.", dns.TypeA)
	resp, _, err := client.Exchange(req, addr)
	if err != nil || resp == nil {
		return false
	}
	return resp.Authoritative && resp.Rcode == dns.RcodeNameError
}
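
// A sketch of the negative branch described in the commit message: a
// silent UDP listener that never responds makes client.Exchange time
// out, so probeBangerDNS reports false and doctor falls through to the
// "held by another process" warning. (Illustrative; t is a *testing.T.)
//
//	pc, _ := net.ListenPacket("udp", "127.0.0.1:0")
//	defer pc.Close()
//	if probeBangerDNS(pc.LocalAddr().String()) {
//		t.Fatal("silent UDP listener must not be mistaken for banger dns")
//	}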

// natCapability sets up host-side NAT so guest traffic can reach the
// outside world. Needs VMService (tap lookup + aliveness) and
// HostNetwork (NAT rules), plus the daemon logger for the cleanup
// short-circuit note.
type natCapability struct {
	vm     *VMService
	net    *HostNetwork
	logger *slog.Logger
}

func newNATCapability(vm *VMService, net *HostNetwork, logger *slog.Logger) natCapability {
	return natCapability{vm: vm, net: net, logger: logger}
}

func (natCapability) Name() string { return "nat" }

func (c natCapability) AddStartPreflight(ctx context.Context, checks *system.Preflight, vm model.VMRecord, _ model.Image) {
	if !vm.Spec.NATEnabled {
		return
	}
	c.net.addNATPrereqs(ctx, checks)
}

func (c natCapability) PostStart(ctx context.Context, vm model.VMRecord, _ model.Image) error {
	if !vm.Spec.NATEnabled {
		return nil
	}
	return c.net.ensureNAT(ctx, vm.Runtime.GuestIP, c.vm.vmHandles(vm.ID).TapDevice, true)
}

func (c natCapability) Cleanup(ctx context.Context, vm model.VMRecord) error {
	if !vm.Spec.NATEnabled {
		return nil
	}
	// Handle cache is volatile across daemon restarts; Runtime is
	// the persisted DB-backed copy. Fall back so a crash / corrupt
	// handles.json doesn't leak iptables rules keyed off the tap.
	tap := strings.TrimSpace(c.vm.vmHandles(vm.ID).TapDevice)
	if tap == "" {
		tap = strings.TrimSpace(vm.Runtime.TapDevice)
	}
	if strings.TrimSpace(vm.Runtime.GuestIP) == "" || tap == "" {
		if c.logger != nil {
			c.logger.Debug("skipping nat cleanup without runtime network handles", append(vmLogAttrs(vm), "guest_ip", vm.Runtime.GuestIP, "tap_device", tap)...)
		}
		return nil
	}
	return c.net.ensureNAT(ctx, vm.Runtime.GuestIP, tap, false)
}

func (c natCapability) ApplyConfigChange(ctx context.Context, before, after model.VMRecord) error {
	if before.Spec.NATEnabled == after.Spec.NATEnabled {
		return nil
	}
	if !c.vm.vmAlive(after) {
		return nil
	}
	return c.net.ensureNAT(ctx, after.Runtime.GuestIP, c.vm.vmHandles(after.ID).TapDevice, after.Spec.NATEnabled)
}

func (c natCapability) AddDoctorChecks(ctx context.Context, report *system.Report) {
	checks := system.NewPreflight()
	checks.RequireCommand("ip", toolHint("ip"))
	c.net.addNATPrereqs(ctx, checks)
	if len(checks.Problems()) > 0 {
		report.Add(system.CheckStatusFail, "feature nat", checks.Problems()...)
		return
	}
	uplink, err := c.net.defaultUplink(ctx)
	if err != nil {
		report.AddFail("feature nat", err.Error())
		return
	}
	report.AddPass("feature nat", "iptables/sysctl available, uplink "+uplink)
}

// joinErr folds cleanup errors together via errors.Join, which drops
// nil inputs: the accumulated error stays nil until a Cleanup actually
// fails, and errors.Is/As still match every joined error afterwards.
func joinErr(current, next error) error {
	return errors.Join(current, next)
}
|