Cleanup identity for kernel objects was split across two sources of
truth: vm.Runtime (DB-backed, durable) held paths and the guest IP,
but the TAP name lived only in the in-process handle cache + the
best-effort handles.json scratch file next to the VM dir. Every
other cleanup-identifying datum has a fallback — firecracker PID
can be rediscovered via `pgrep -f <apiSock>`, loops via losetup, dm
name from the deterministic ShortID(vm.ID). The tap is the one
truly cache-only datum (allocated from a pool, not derivable).
That made NAT teardown fragile:
- daemon crash between `acquireTap` and the handles.json write
- handles.json corrupt on the next daemon start
- partial cleanup that already zeroed the cache
In any of those cases natCapability.Cleanup short-circuited
("skipping nat cleanup without runtime network handles") and the
per-VM POSTROUTING MASQUERADE + the two FORWARD rules keyed off
the tap would leak. The VM row in the DB still existed, so a retry
couldn't close the loop — the tap name was simply gone.
Fix: mirror TapDevice onto model.VMRuntime (serialised via the
existing runtime_json column, omitempty so existing rows upgrade
cleanly). Set it in startVMLocked right next to the
s.setVMHandles call that seeds the in-memory cache; clear it at
every post-cleanup reset site (stop normal path + stop stale
branch, kill normal path + kill stale branch, cleanupOnErr in
start, reconcile's stale-vm branch, the stats poller's auto-stop
path).
Fallbacks now cascade:
- natCapability.Cleanup: handles cache → Runtime.TapDevice
- cleanupRuntime (releaseTap): handles cache → Runtime.TapDevice
Both surfaces refuse gracefully (old behaviour) only when neither
source has a value, which really does mean "no tap was ever
allocated for this VM" rather than "we lost track of it."
Test: TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice clears
the handle cache, sets vm.Runtime.TapDevice, and asserts Cleanup
reaches the runner — the exact scenario the review flagged as a
plausible leak and the exact code path that now guarantees it
doesn't.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
143 lines
4.1 KiB
Go
143 lines
4.1 KiB
Go
package daemon
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"banger/internal/daemon/fcproc"
|
|
"banger/internal/model"
|
|
"banger/internal/namegen"
|
|
"banger/internal/system"
|
|
"banger/internal/vmdns"
|
|
)
|
|
|
|
// Cross-service constants. Kept in vm.go because both lifecycle
// (VMService) and networking (HostNetwork) reference them; moving
// them to either owner would read as a layering violation.
var (
	// errWaitForExitTimeout re-exports the fcproc sentinel under a
	// package-local name so this package's callers don't need to
	// import fcproc to match against it.
	errWaitForExitTimeout = fcproc.ErrWaitForExitTimeout
	// gracefulShutdownWait bounds a graceful shutdown attempt;
	// presumably callers fall through to a hard kill after it —
	// confirm at the call sites (not visible in this chunk).
	gracefulShutdownWait = 10 * time.Second
	// vsockReadyWait / vsockReadyPoll bound and pace the polling
	// loop that waits for the guest vsock endpoint after boot.
	vsockReadyWait = 30 * time.Second
	vsockReadyPoll = 200 * time.Millisecond
)
|
|
|
|
// rebuildDNS enumerates live VMs and republishes the DNS record set.
|
|
// Lives on VMService because "alive" is a VM-state concern that
|
|
// HostNetwork shouldn't need to reach into. VMService orchestrates:
|
|
// VM list from the store, alive filter, hand the resulting map to
|
|
// HostNetwork.replaceDNS.
|
|
func (s *VMService) rebuildDNS(ctx context.Context) error {
|
|
if s.net == nil {
|
|
return nil
|
|
}
|
|
vms, err := s.store.ListVMs(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
records := make(map[string]string)
|
|
for _, vm := range vms {
|
|
if !s.vmAlive(vm) {
|
|
continue
|
|
}
|
|
if strings.TrimSpace(vm.Runtime.GuestIP) == "" {
|
|
continue
|
|
}
|
|
records[vmdns.RecordName(vm.Name)] = vm.Runtime.GuestIP
|
|
}
|
|
return s.net.replaceDNS(records)
|
|
}
|
|
|
|
// cleanupRuntime tears down the host-side state for a VM: firecracker
// process, DM snapshot, capabilities, tap, sockets. Lives on VMService
// because it reaches into handles (VMService-owned); the capability
// teardown goes through the capHooks seam to keep Daemon out of the
// dependency chain.
//
// Errors from the independent teardown stages (snapshot, capability
// hooks, tap release) are accumulated and joined at the end so one
// failing stage doesn't block the others; only a failed wait-for-exit
// aborts early, since tearing down kernel objects under a still-live
// firecracker process would be unsafe.
func (s *VMService) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserveDisks bool) error {
	if s.logger != nil {
		s.logger.Debug("cleanup runtime", append(vmLogAttrs(vm), "preserve_disks", preserveDisks)...)
	}
	h := s.vmHandles(vm.ID)
	// The cached PID may be stale (daemon restart); prefer a live
	// rediscovery keyed off the API socket path when available.
	cleanupPID := h.PID
	if vm.Runtime.APISockPath != "" {
		if pid, err := s.net.findFirecrackerPID(ctx, vm.Runtime.APISockPath); err == nil && pid > 0 {
			cleanupPID = pid
		}
	}
	// Kill the VMM first and wait for it to actually exit; the kill
	// error is deliberately ignored (best effort — the process may
	// already be gone), but a wait timeout aborts the whole cleanup.
	if cleanupPID > 0 && system.ProcessRunning(cleanupPID, vm.Runtime.APISockPath) {
		_ = s.net.killVMProcess(ctx, cleanupPID)
		if err := s.net.waitForExit(ctx, cleanupPID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
			return err
		}
	}
	// Device-mapper snapshot (loops + dm table) teardown; errors are
	// collected rather than returned immediately.
	snapshotErr := s.net.cleanupDMSnapshot(ctx, dmSnapshotHandles{
		BaseLoop: h.BaseLoop,
		COWLoop:  h.COWLoop,
		DMName:   h.DMName,
		DMDev:    h.DMDev,
	})
	// Capability (feature) state teardown via the capHooks seam.
	featureErr := s.capHooks.cleanupState(ctx, vm)
	var tapErr error
	// Prefer the handle cache (fresh from startVMLocked), but fall
	// back to Runtime.TapDevice — persisted to the DB in the same
	// stage — so a daemon restart or corrupt handles.json doesn't
	// leak the tap (or the NAT FORWARD rules keyed off it).
	tap := h.TapDevice
	if tap == "" {
		tap = vm.Runtime.TapDevice
	}
	if tap != "" {
		tapErr = s.net.releaseTap(ctx, tap)
	}
	// Socket files are best-effort removals; a failure here (already
	// removed, perms) is not worth surfacing.
	if vm.Runtime.APISockPath != "" {
		_ = os.Remove(vm.Runtime.APISockPath)
	}
	if vm.Runtime.VSockPath != "" {
		_ = os.Remove(vm.Runtime.VSockPath)
	}
	// The handles are only meaningful while the kernel objects exist;
	// dropping them here keeps the cache in sync with reality even
	// when the caller forgets to call clearVMHandles explicitly.
	s.clearVMHandles(vm)
	if !preserveDisks && vm.Runtime.VMDir != "" {
		return errors.Join(snapshotErr, featureErr, tapErr, os.RemoveAll(vm.Runtime.VMDir))
	}
	return errors.Join(snapshotErr, featureErr, tapErr)
}
|
|
|
|
func (s *VMService) generateName(ctx context.Context) (string, error) {
|
|
_ = ctx
|
|
if name := strings.TrimSpace(namegen.Generate()); name != "" {
|
|
return name, nil
|
|
}
|
|
return "vm-" + strconv.FormatInt(time.Now().Unix(), 10), nil
|
|
}
|
|
|
|
// bridgePrefix returns the first three dot-separated octets of
// bridgeIP (e.g. "192.168.64.1" → "192.168.64"), the usual /24
// network prefix. Inputs with fewer than three dot-separated parts
// are returned unchanged.
func bridgePrefix(bridgeIP string) string {
	octets := strings.SplitN(bridgeIP, ".", 4)
	if len(octets) < 3 {
		return bridgeIP
	}
	return octets[0] + "." + octets[1] + "." + octets[2]
}
|
|
|
|
// optionalIntOrDefault dereferences value when it is set; a nil value
// yields fallback.
func optionalIntOrDefault(value *int, fallback int) int {
	if value == nil {
		return fallback
	}
	return *value
}
|
|
|
|
// validateOptionalPositiveSetting rejects a set value that is not
// strictly positive; an unset (nil) value is always valid. label is
// interpolated into the returned error message.
func validateOptionalPositiveSetting(label string, value *int) error {
	switch {
	case value == nil:
		return nil
	case *value > 0:
		return nil
	default:
		return fmt.Errorf("%s must be a positive integer", label)
	}
}
|