banger/internal/daemon/capabilities.go
Thales Maciel 5eceebe49f
daemon: persist tap device on VM.Runtime so NAT teardown survives handle-cache loss
Cleanup identity for kernel objects was split across two sources of
truth: vm.Runtime (DB-backed, durable) held paths and the guest IP,
but the TAP name lived only in the in-process handle cache + the
best-effort handles.json scratch file next to the VM dir. Every
other cleanup-identifying datum has a fallback — firecracker PID
can be rediscovered via `pgrep -f <apiSock>`, loops via losetup, dm
name from the deterministic ShortID(vm.ID). The tap is the one
truly cache-only datum (allocated from a pool, not derivable).
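
For illustration, the PID rediscovery mentioned above amounts to
something like this (a sketch only; apiSock stands in for the stored
socket path, and the daemon's real matching logic may differ):

    // Recover a lost firecracker PID by matching the process's
    // command line against the VM's API socket path.
    out, err := exec.Command("pgrep", "-f", apiSock).Output()
    if err == nil {
        pid, _ := strconv.Atoi(strings.TrimSpace(string(out)))
        _ = pid // candidate firecracker process for this VM
    }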

That made NAT teardown fragile:

  - daemon crash between `acquireTap` and the handles.json write
  - handles.json corrupt on the next daemon start
  - partial cleanup that already zeroed the cache

In any of those cases natCapability.Cleanup short-circuited
("skipping nat cleanup without runtime network handles") and the
per-VM POSTROUTING MASQUERADE + the two FORWARD rules keyed off
the tap would leak. The VM row in the DB still existed, so a retry
couldn't close the loop — the tap name was simply gone.
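
Concretely, the rules at stake have the conventional per-VM NAT shape
(a sketch; the exact arguments live in HostNetwork.ensureNAT and may
differ):

    // One MASQUERADE keyed off the guest IP, two FORWARD rules keyed
    // off the tap; action flips between -A (setup) and -D (teardown).
    rules := [][]string{
        {"-t", "nat", action, "POSTROUTING", "-s", guestIP, "-j", "MASQUERADE"},
        {action, "FORWARD", "-i", tap, "-j", "ACCEPT"},
        {action, "FORWARD", "-o", tap, "-m", "state", "--state", "ESTABLISHED,RELATED", "-j", "ACCEPT"},
    }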

Fix: mirror TapDevice onto model.VMRuntime (serialised via the
existing runtime_json column, omitempty so existing rows upgrade
cleanly). Set it in startVMLocked right next to the
s.setVMHandles call that seeds the in-memory cache; clear it at
every post-cleanup reset site (stop normal path + stop stale
branch, kill normal path + kill stale branch, cleanupOnErr in
start, reconcile's stale-vm branch, the stats poller's auto-stop
path).
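
A minimal sketch of the model change (GuestIP / DNSName /
WorkDiskPath appear at existing call sites; the JSON tag spellings
are assumptions):

    // model.VMRuntime, abridged. Everything rides the existing
    // runtime_json column, so no schema migration is needed.
    type VMRuntime struct {
        GuestIP      string `json:"guest_ip,omitempty"`
        DNSName      string `json:"dns_name,omitempty"`
        WorkDiskPath string `json:"work_disk_path,omitempty"`

        // Durable mirror of the pool-allocated tap name; omitempty
        // means rows written before this change round-trip unchanged.
        TapDevice string `json:"tap_device,omitempty"`
    }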

Fallbacks now cascade:

  - natCapability.Cleanup: handles cache → Runtime.TapDevice
  - cleanupRuntime (releaseTap): handles cache → Runtime.TapDevice

Both surfaces refuse gracefully (old behaviour) only when neither
source has a value, which really does mean "no tap was ever
allocated for this VM" rather than "we lost track of it."
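
Both fallbacks reduce to the same two-line cascade, sketched here for
the releaseTap side (the release helper's exact signature is an
assumption):

    // Prefer the live handle cache; fall back to the persisted copy.
    tap := strings.TrimSpace(s.vmHandles(vm.ID).TapDevice)
    if tap == "" {
        tap = strings.TrimSpace(vm.Runtime.TapDevice)
    }
    if tap != "" {
        s.releaseTap(tap) // return the tap to the pool
    }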

Test: TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice clears
the handle cache, sets vm.Runtime.TapDevice, and asserts Cleanup
reaches the runner — the exact scenario the review flagged as a
plausible leak, and the exact code path that now guarantees it
doesn't happen.
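
Condensed, the test reads roughly like this (the harness and fake
runner are assumptions; only the test name and the fields it touches
come from this change):

    func TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice(t *testing.T) {
        c, runner := newTestNATCapability(t) // hypothetical harness
        vm := model.VMRecord{}
        vm.Spec.NATEnabled = true
        vm.Runtime.GuestIP = "10.0.0.2"
        vm.Runtime.TapDevice = "tap-test0"
        // Handle cache deliberately left empty: pre-fix, Cleanup
        // short-circuited here and the iptables rules leaked.
        if err := c.Cleanup(context.Background(), vm); err != nil {
            t.Fatalf("cleanup: %v", err)
        }
        if !runner.sawTeardown("tap-test0") {
            t.Fatal("cleanup never reached the runner")
        }
    }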

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 14:21:13 -03:00

package daemon

import (
	"context"
	"errors"
	"log/slog"
	"net"
	"os"
	"strings"

	"banger/internal/firecracker"
	"banger/internal/guestconfig"
	"banger/internal/model"
	"banger/internal/store"
	"banger/internal/system"
	"banger/internal/vmdns"
)

// vmCapability is the base capability tag. Actual behaviour lives on
// optional sub-interfaces (startPreflight / guestConfig / machineConfig
// / prepareHost / postStart / cleanup / configChange / doctor); a
// capability implements whichever subset it cares about. None of them
// take *Daemon — each capability is a struct constructed with its
// explicit service-pointer dependencies at wireServices time.
type vmCapability interface {
	Name() string
}

type startPreflightCapability interface {
	AddStartPreflight(context.Context, *system.Preflight, model.VMRecord, model.Image)
}

type guestConfigCapability interface {
	ContributeGuest(*guestconfig.Builder, model.VMRecord, model.Image)
}

type machineConfigCapability interface {
	ContributeMachine(*firecracker.MachineConfig, model.VMRecord, model.Image)
}

type prepareHostCapability interface {
	PrepareHost(context.Context, *model.VMRecord, model.Image) error
}

type postStartCapability interface {
	PostStart(context.Context, model.VMRecord, model.Image) error
}

type cleanupCapability interface {
	Cleanup(context.Context, model.VMRecord) error
}

type configChangeCapability interface {
	ApplyConfigChange(context.Context, model.VMRecord, model.VMRecord) error
}

type doctorCapability interface {
	AddDoctorChecks(context.Context, *system.Report)
}

// defaultCapabilities builds the production capability list from
// already-constructed services. Called from wireServices once d.vm /
// d.ws / d.net are populated, so every capability ships with the
// concrete service pointers it needs and none of them reach through
// *Daemon at dispatch time.
func (d *Daemon) defaultCapabilities() []vmCapability {
	return []vmCapability{
		newWorkDiskCapability(d.vm, d.ws, d.store, d.config.DefaultImageName),
		newDNSCapability(d.net),
		newNATCapability(d.vm, d.net, d.logger),
	}
}

func (d *Daemon) addCapabilityStartPrereqs(ctx context.Context, checks *system.Preflight, vm model.VMRecord, image model.Image) {
	for _, capability := range d.vmCaps {
		if hook, ok := capability.(startPreflightCapability); ok {
			hook.AddStartPreflight(ctx, checks, vm, image)
		}
	}
}

func (d *Daemon) contributeGuestConfig(builder *guestconfig.Builder, vm model.VMRecord, image model.Image) {
	for _, capability := range d.vmCaps {
		if hook, ok := capability.(guestConfigCapability); ok {
			hook.ContributeGuest(builder, vm, image)
		}
	}
}

func (d *Daemon) contributeMachineConfig(cfg *firecracker.MachineConfig, vm model.VMRecord, image model.Image) {
	for _, capability := range d.vmCaps {
		if hook, ok := capability.(machineConfigCapability); ok {
			hook.ContributeMachine(cfg, vm, image)
		}
	}
}

func (d *Daemon) prepareCapabilityHosts(ctx context.Context, vm *model.VMRecord, image model.Image) error {
	prepared := make([]vmCapability, 0, len(d.vmCaps))
	for _, capability := range d.vmCaps {
		hook, ok := capability.(prepareHostCapability)
		if !ok {
			continue
		}
		if err := hook.PrepareHost(ctx, vm, image); err != nil {
			// Unwind whatever already prepared; Background so the
			// teardown survives a cancelled start context.
			d.cleanupPreparedCapabilities(context.Background(), vm, prepared)
			return err
		}
		prepared = append(prepared, capability)
	}
	return nil
}

func (d *Daemon) postStartCapabilities(ctx context.Context, vm model.VMRecord, image model.Image) error {
	for _, capability := range d.vmCaps {
		switch capability.Name() {
		case "dns":
			vmCreateStage(ctx, "apply_dns", "publishing vm dns record")
		case "nat":
			if vm.Spec.NATEnabled {
				vmCreateStage(ctx, "apply_nat", "configuring nat")
			}
		}
		if hook, ok := capability.(postStartCapability); ok {
			if err := hook.PostStart(ctx, vm, image); err != nil {
				return err
			}
		}
	}
	return nil
}

func (d *Daemon) cleanupCapabilityState(ctx context.Context, vm model.VMRecord) error {
	return d.cleanupPreparedCapabilities(ctx, &vm, d.vmCaps)
}

// cleanupPreparedCapabilities tears down in reverse registration
// order and keeps going past individual failures, joining the errors.
func (d *Daemon) cleanupPreparedCapabilities(ctx context.Context, vm *model.VMRecord, capabilities []vmCapability) error {
	var err error
	for index := len(capabilities) - 1; index >= 0; index-- {
		hook, ok := capabilities[index].(cleanupCapability)
		if !ok {
			continue
		}
		err = joinErr(err, hook.Cleanup(ctx, *vm))
	}
	return err
}

func (d *Daemon) applyCapabilityConfigChanges(ctx context.Context, before, after model.VMRecord) error {
	for _, capability := range d.vmCaps {
		if hook, ok := capability.(configChangeCapability); ok {
			if err := hook.ApplyConfigChange(ctx, before, after); err != nil {
				return err
			}
		}
	}
	return nil
}

func (d *Daemon) addCapabilityDoctorChecks(ctx context.Context, report *system.Report) {
	for _, capability := range d.vmCaps {
		if hook, ok := capability.(doctorCapability); ok {
			hook.AddDoctorChecks(ctx, report)
		}
	}
}

// workDiskCapability provisions a per-VM work disk (image-seeded or
// freshly formatted) and syncs host-side authorised keys + git
// identity + file_sync entries onto it. Holds pointers to the VM and
// workspace services because PrepareHost orchestrates across both,
// plus the store + default image name for its doctor check.
type workDiskCapability struct {
	vm               *VMService
	ws               *WorkspaceService
	store            *store.Store
	defaultImageName string
}

func newWorkDiskCapability(vm *VMService, ws *WorkspaceService, st *store.Store, defaultImageName string) workDiskCapability {
	return workDiskCapability{
		vm:               vm,
		ws:               ws,
		store:            st,
		defaultImageName: defaultImageName,
	}
}

func (workDiskCapability) Name() string { return "work-disk" }

func (workDiskCapability) AddStartPreflight(_ context.Context, checks *system.Preflight, vm model.VMRecord, image model.Image) {
	if exists(vm.Runtime.WorkDiskPath) {
		return
	}
	imageSeed := ""
	if image.RootfsPath != "" {
		imageSeed = image.WorkSeedPath
	}
	if exists(imageSeed) {
		// A seed smaller than the requested disk means a grow, which
		// needs the resize tooling.
		if info, err := os.Stat(imageSeed); err == nil && vm.Spec.WorkDiskSizeBytes > info.Size() {
			checks.RequireCommand("e2fsck", toolHint("e2fsck"))
			checks.RequireCommand("resize2fs", toolHint("resize2fs"))
		}
		return
	}
	for _, command := range []string{"mkfs.ext4", "mount", "umount", "cp"} {
		checks.RequireCommand(command, toolHint(command))
	}
}

func (workDiskCapability) ContributeGuest(builder *guestconfig.Builder, _ model.VMRecord, _ model.Image) {
	builder.AddMount(guestconfig.MountSpec{
		Source:  "/dev/vdb",
		Target:  "/root",
		FSType:  "ext4",
		Options: []string{"defaults"},
		Dump:    0,
		Pass:    2,
	})
}

func (workDiskCapability) ContributeMachine(cfg *firecracker.MachineConfig, vm model.VMRecord, _ model.Image) {
	cfg.Drives = append(cfg.Drives, firecracker.DriveConfig{
		ID:       "work",
		Path:     vm.Runtime.WorkDiskPath,
		ReadOnly: false,
	})
}

func (c workDiskCapability) PrepareHost(ctx context.Context, vm *model.VMRecord, image model.Image) error {
	prep, err := c.vm.ensureWorkDisk(ctx, vm, image)
	if err != nil {
		return err
	}
	if err := c.ws.ensureAuthorizedKeyOnWorkDisk(ctx, vm, image, prep); err != nil {
		return err
	}
	if err := c.ws.ensureGitIdentityOnWorkDisk(ctx, vm); err != nil {
		return err
	}
	return c.ws.runFileSync(ctx, vm)
}

func (c workDiskCapability) AddDoctorChecks(_ context.Context, report *system.Report) {
	if c.store != nil && strings.TrimSpace(c.defaultImageName) != "" {
		if image, err := c.store.GetImageByName(context.Background(), c.defaultImageName); err == nil && strings.TrimSpace(image.WorkSeedPath) != "" && exists(image.WorkSeedPath) {
			checks := system.NewPreflight()
			checks.RequireFile(image.WorkSeedPath, "default image work-seed", `rebuild the default image to regenerate the /root seed`)
			report.AddPreflight("feature /root work disk", checks, "seeded /root work disk artifact available")
			return
		}
	}
	checks := system.NewPreflight()
	for _, command := range []string{"mkfs.ext4", "mount", "umount", "cp"} {
		checks.RequireCommand(command, toolHint(command))
	}
	report.AddPreflight("feature /root work disk", checks, "fallback /root work disk tooling available")
	report.AddWarn("feature /root work disk", "default image has no work-seed artifact; new VM creates will be slower until the image is rebuilt")
}

// dnsCapability publishes + removes <vm>.vm records on the in-process
// DNS server. Only needs HostNetwork.
type dnsCapability struct {
	net *HostNetwork
}

func newDNSCapability(net *HostNetwork) dnsCapability {
	return dnsCapability{net: net}
}

func (dnsCapability) Name() string { return "dns" }

func (c dnsCapability) PostStart(ctx context.Context, vm model.VMRecord, _ model.Image) error {
	return c.net.setDNS(ctx, vm.Name, vm.Runtime.GuestIP)
}

func (c dnsCapability) Cleanup(_ context.Context, vm model.VMRecord) error {
	return c.net.removeDNS(vm.Runtime.DNSName)
}

func (dnsCapability) AddDoctorChecks(_ context.Context, report *system.Report) {
	conn, err := net.ListenPacket("udp", vmdns.DefaultListenAddr)
	if err != nil {
		if strings.Contains(strings.ToLower(err.Error()), "address already in use") {
			report.AddWarn("feature vm dns", "listener address "+vmdns.DefaultListenAddr+" is already in use")
			return
		}
		report.AddFail("feature vm dns", "cannot bind "+vmdns.DefaultListenAddr+": "+err.Error())
		return
	}
	_ = conn.Close()
	report.AddPass("feature vm dns", "listener can bind "+vmdns.DefaultListenAddr)
}

// natCapability sets up host-side NAT so guest traffic can reach the
// outside world. Needs VMService (tap lookup + aliveness) and
// HostNetwork (NAT rules), plus the daemon logger for the cleanup
// short-circuit note.
type natCapability struct {
	vm     *VMService
	net    *HostNetwork
	logger *slog.Logger
}

func newNATCapability(vm *VMService, net *HostNetwork, logger *slog.Logger) natCapability {
	return natCapability{vm: vm, net: net, logger: logger}
}

func (natCapability) Name() string { return "nat" }

func (c natCapability) AddStartPreflight(ctx context.Context, checks *system.Preflight, vm model.VMRecord, _ model.Image) {
	if !vm.Spec.NATEnabled {
		return
	}
	c.net.addNATPrereqs(ctx, checks)
}

func (c natCapability) PostStart(ctx context.Context, vm model.VMRecord, _ model.Image) error {
	if !vm.Spec.NATEnabled {
		return nil
	}
	return c.net.ensureNAT(ctx, vm.Runtime.GuestIP, c.vm.vmHandles(vm.ID).TapDevice, true)
}

func (c natCapability) Cleanup(ctx context.Context, vm model.VMRecord) error {
	if !vm.Spec.NATEnabled {
		return nil
	}
	// Handle cache is volatile across daemon restarts; Runtime is
	// the persisted DB-backed copy. Fall back so a crash / corrupt
	// handles.json doesn't leak iptables rules keyed off the tap.
	tap := strings.TrimSpace(c.vm.vmHandles(vm.ID).TapDevice)
	if tap == "" {
		tap = strings.TrimSpace(vm.Runtime.TapDevice)
	}
	if strings.TrimSpace(vm.Runtime.GuestIP) == "" || tap == "" {
		if c.logger != nil {
			c.logger.Debug("skipping nat cleanup without runtime network handles", append(vmLogAttrs(vm), "guest_ip", vm.Runtime.GuestIP, "tap_device", tap)...)
		}
		return nil
	}
	return c.net.ensureNAT(ctx, vm.Runtime.GuestIP, tap, false)
}

func (c natCapability) ApplyConfigChange(ctx context.Context, before, after model.VMRecord) error {
	if before.Spec.NATEnabled == after.Spec.NATEnabled {
		return nil
	}
	if !c.vm.vmAlive(after) {
		return nil
	}
	return c.net.ensureNAT(ctx, after.Runtime.GuestIP, c.vm.vmHandles(after.ID).TapDevice, after.Spec.NATEnabled)
}

func (c natCapability) AddDoctorChecks(ctx context.Context, report *system.Report) {
	checks := system.NewPreflight()
	checks.RequireCommand("ip", toolHint("ip"))
	c.net.addNATPrereqs(ctx, checks)
	if len(checks.Problems()) > 0 {
		report.Add(system.CheckStatusFail, "feature nat", checks.Problems()...)
		return
	}
	uplink, err := c.net.defaultUplink(ctx)
	if err != nil {
		report.AddFail("feature nat", err.Error())
		return
	}
	report.AddPass("feature nat", "iptables/sysctl available, uplink "+uplink)
}

// joinErr is a thin alias for errors.Join so teardown loops read as
// accumulate-and-continue.
func joinErr(current, next error) error {
	return errors.Join(current, next)
}