daemon: persist tap device on VM.Runtime so NAT teardown survives handle-cache loss
Cleanup identity for kernel objects was split across two sources of
truth: vm.Runtime (DB-backed, durable) held paths and the guest IP,
but the TAP name lived only in the in-process handle cache + the
best-effort handles.json scratch file next to the VM dir. Every
other cleanup-identifying datum has a fallback — firecracker PID
can be rediscovered via `pgrep -f <apiSock>`, loops via losetup, dm
name from the deterministic ShortID(vm.ID). The tap is the one
truly cache-only datum (allocated from a pool, not derivable).
That made NAT teardown fragile:
- daemon crash between `acquireTap` and the handles.json write
- handles.json corrupt on the next daemon start
- partial cleanup that already zeroed the cache
In any of those cases, natCapability.Cleanup short-circuited
("skipping nat cleanup without runtime network handles") and the
per-VM POSTROUTING MASQUERADE + the two FORWARD rules keyed off
the tap would leak. The VM row in the DB still existed, so a retry
couldn't close the loop — the tap name was simply gone.
Fix: mirror TapDevice onto model.VMRuntime (serialised via the
existing runtime_json column, omitempty so existing rows upgrade
cleanly). Set it in startVMLocked right next to the
s.setVMHandles call that seeds the in-memory cache; clear it at
every post-cleanup reset site (stop normal path + stop stale
branch, kill normal path + kill stale branch, cleanupOnErr in
start, reconcile's stale-vm branch, the stats poller's auto-stop
path).
Fallbacks now cascade:
- natCapability.Cleanup: handles cache → Runtime.TapDevice
- cleanupRuntime (releaseTap): handles cache → Runtime.TapDevice
Both surfaces refuse gracefully (old behaviour) only when neither
source has a value, which really does mean "no tap was ever
allocated for this VM" rather than "we lost track of it."
Test: TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice clears
the handle cache, sets vm.Runtime.TapDevice, and asserts Cleanup
reaches the runner — the exact scenario the review flagged as a
plausible leak, and the exact code path that now guarantees the
rules are torn down instead of leaking.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1850904d9c
commit
5eceebe49f
7 changed files with 72 additions and 16 deletions
|
|
@ -326,8 +326,14 @@ func (c natCapability) Cleanup(ctx context.Context, vm model.VMRecord) error {
|
|||
if !vm.Spec.NATEnabled {
|
||||
return nil
|
||||
}
|
||||
tap := c.vm.vmHandles(vm.ID).TapDevice
|
||||
if strings.TrimSpace(vm.Runtime.GuestIP) == "" || strings.TrimSpace(tap) == "" {
|
||||
// Handle cache is volatile across daemon restarts; Runtime is
|
||||
// the persisted DB-backed copy. Fall back so a crash / corrupt
|
||||
// handles.json doesn't leak iptables rules keyed off the tap.
|
||||
tap := strings.TrimSpace(c.vm.vmHandles(vm.ID).TapDevice)
|
||||
if tap == "" {
|
||||
tap = strings.TrimSpace(vm.Runtime.TapDevice)
|
||||
}
|
||||
if strings.TrimSpace(vm.Runtime.GuestIP) == "" || tap == "" {
|
||||
if c.logger != nil {
|
||||
c.logger.Debug("skipping nat cleanup without runtime network handles", append(vmLogAttrs(vm), "guest_ip", vm.Runtime.GuestIP, "tap_device", tap)...)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -552,6 +552,7 @@ func (d *Daemon) reconcile(ctx context.Context) error {
|
|||
_ = d.vm.cleanupRuntime(ctx, vm, true)
|
||||
vm.State = model.VMStateStopped
|
||||
vm.Runtime.State = model.VMStateStopped
|
||||
vm.Runtime.TapDevice = ""
|
||||
d.vm.clearVMHandles(vm)
|
||||
vm.UpdatedAt = model.Now()
|
||||
return d.store.UpsertVM(ctx, vm)
|
||||
|
|
|
|||
|
|
@ -174,3 +174,26 @@ func TestNATCapabilityCleanup_ReversesNATWhenRuntimePresent(t *testing.T) {
|
|||
t.Fatal("runner calls = 0, want ensureNAT(false) to execute when runtime wiring exists")
|
||||
}
|
||||
}
|
||||
|
||||
// TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice simulates the
|
||||
// post-crash / corrupt-handles.json scenario: the in-memory handle
|
||||
// cache is empty, but the DB-backed VM.Runtime still carries the
|
||||
// tap name (startVMLocked persists it alongside the handle cache).
|
||||
// Cleanup must use that fallback so the iptables FORWARD rules
|
||||
// keyed on the tap are actually removed — if Cleanup short-circuits
|
||||
// the way it did before this fix, those rules leak forever.
|
||||
func TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice(t *testing.T) {
|
||||
f := newNATCapabilityFixture(t, true)
|
||||
// Wipe the handle cache, as if the daemon had just restarted
|
||||
// against a corrupt (or missing) handles.json.
|
||||
f.d.vm.clearVMHandles(f.vm)
|
||||
// But the VM row in the DB still has the tap recorded.
|
||||
f.vm.Runtime.TapDevice = "tap-nat-42"
|
||||
|
||||
if err := f.cap.Cleanup(context.Background(), f.vm); err != nil {
|
||||
t.Fatalf("Cleanup: %v", err)
|
||||
}
|
||||
if n := f.runner.total(); n == 0 {
|
||||
t.Fatal("runner calls = 0, want ensureNAT(false) to execute via the Runtime.TapDevice fallback; NAT rules would leak across daemon restarts")
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -82,8 +82,16 @@ func (s *VMService) cleanupRuntime(ctx context.Context, vm model.VMRecord, prese
|
|||
})
|
||||
featureErr := s.capHooks.cleanupState(ctx, vm)
|
||||
var tapErr error
|
||||
if h.TapDevice != "" {
|
||||
tapErr = s.net.releaseTap(ctx, h.TapDevice)
|
||||
// Prefer the handle cache (fresh from startVMLocked), but fall
|
||||
// back to Runtime.TapDevice — persisted to the DB in the same
|
||||
// stage — so a daemon restart or corrupt handles.json doesn't
|
||||
// leak the tap (or the NAT FORWARD rules keyed off it).
|
||||
tap := h.TapDevice
|
||||
if tap == "" {
|
||||
tap = vm.Runtime.TapDevice
|
||||
}
|
||||
if tap != "" {
|
||||
tapErr = s.net.releaseTap(ctx, tap)
|
||||
}
|
||||
if vm.Runtime.APISockPath != "" {
|
||||
_ = os.Remove(vm.Runtime.APISockPath)
|
||||
|
|
|
|||
|
|
@ -123,6 +123,7 @@ func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image
|
|||
if cleanupErr := s.cleanupRuntime(context.Background(), vm, true); cleanupErr != nil {
|
||||
err = errors.Join(err, cleanupErr)
|
||||
}
|
||||
vm.Runtime.TapDevice = ""
|
||||
s.clearVMHandles(vm)
|
||||
_ = s.store.UpsertVM(context.Background(), vm)
|
||||
return model.VMRecord{}, err
|
||||
|
|
@ -165,6 +166,10 @@ func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image
|
|||
}
|
||||
live.TapDevice = tap
|
||||
s.setVMHandles(vm, live)
|
||||
// Mirror onto VM.Runtime so NAT teardown can recover the tap
|
||||
// name from the DB even if the handle cache is empty (daemon
|
||||
// crash + restart, corrupt handles.json).
|
||||
vm.Runtime.TapDevice = tap
|
||||
op.stage("metrics_file", "metrics_path", vm.Runtime.MetricsPath)
|
||||
if err := os.WriteFile(vm.Runtime.MetricsPath, nil, 0o644); err != nil {
|
||||
return cleanupOnErr(err)
|
||||
|
|
@ -277,6 +282,7 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v
|
|||
}
|
||||
vm.State = model.VMStateStopped
|
||||
vm.Runtime.State = model.VMStateStopped
|
||||
vm.Runtime.TapDevice = ""
|
||||
s.clearVMHandles(vm)
|
||||
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
||||
return model.VMRecord{}, err
|
||||
|
|
@ -301,6 +307,7 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v
|
|||
}
|
||||
vm.State = model.VMStateStopped
|
||||
vm.Runtime.State = model.VMStateStopped
|
||||
vm.Runtime.TapDevice = ""
|
||||
s.clearVMHandles(vm)
|
||||
system.TouchNow(&vm)
|
||||
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
||||
|
|
@ -332,6 +339,7 @@ func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, si
|
|||
}
|
||||
vm.State = model.VMStateStopped
|
||||
vm.Runtime.State = model.VMStateStopped
|
||||
vm.Runtime.TapDevice = ""
|
||||
s.clearVMHandles(vm)
|
||||
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
||||
return model.VMRecord{}, err
|
||||
|
|
@ -361,6 +369,7 @@ func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, si
|
|||
}
|
||||
vm.State = model.VMStateStopped
|
||||
vm.Runtime.State = model.VMStateStopped
|
||||
vm.Runtime.TapDevice = ""
|
||||
s.clearVMHandles(vm)
|
||||
system.TouchNow(&vm)
|
||||
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
||||
|
|
|
|||
|
|
@ -128,6 +128,7 @@ func (s *VMService) stopStaleVMs(ctx context.Context) (err error) {
|
|||
_ = s.cleanupRuntime(ctx, vm, true)
|
||||
vm.State = model.VMStateStopped
|
||||
vm.Runtime.State = model.VMStateStopped
|
||||
vm.Runtime.TapDevice = ""
|
||||
s.clearVMHandles(vm)
|
||||
vm.UpdatedAt = model.Now()
|
||||
return s.store.UpsertVM(ctx, vm)
|
||||
|
|
|
|||
|
|
@ -101,18 +101,26 @@ type VMSpec struct {
|
|||
// LastError carries the last failure message for debugging. State
|
||||
// mirrors VMRecord.State.
|
||||
type VMRuntime struct {
|
||||
State VMState `json:"state"`
|
||||
GuestIP string `json:"guest_ip"`
|
||||
APISockPath string `json:"api_sock_path,omitempty"`
|
||||
VSockPath string `json:"vsock_path,omitempty"`
|
||||
VSockCID uint32 `json:"vsock_cid,omitempty"`
|
||||
LogPath string `json:"log_path,omitempty"`
|
||||
MetricsPath string `json:"metrics_path,omitempty"`
|
||||
DNSName string `json:"dns_name,omitempty"`
|
||||
VMDir string `json:"vm_dir"`
|
||||
SystemOverlay string `json:"system_overlay_path"`
|
||||
WorkDiskPath string `json:"work_disk_path"`
|
||||
LastError string `json:"last_error,omitempty"`
|
||||
State VMState `json:"state"`
|
||||
GuestIP string `json:"guest_ip"`
|
||||
APISockPath string `json:"api_sock_path,omitempty"`
|
||||
VSockPath string `json:"vsock_path,omitempty"`
|
||||
VSockCID uint32 `json:"vsock_cid,omitempty"`
|
||||
LogPath string `json:"log_path,omitempty"`
|
||||
MetricsPath string `json:"metrics_path,omitempty"`
|
||||
DNSName string `json:"dns_name,omitempty"`
|
||||
VMDir string `json:"vm_dir"`
|
||||
// TapDevice mirrors VMHandles.TapDevice but persists across
|
||||
// daemon restarts / handle-cache loss. NAT teardown needs the
|
||||
// exact tap name to delete the FORWARD rules; if we only had
|
||||
// the handle cache, a crash between tap acquire and handles.json
|
||||
// write — or a corrupt handles.json on the next daemon start —
|
||||
// would silently leak the rules. Storing it on the VM record
|
||||
// makes cleanup correct as long as the VM row exists.
|
||||
TapDevice string `json:"tap_device,omitempty"`
|
||||
SystemOverlay string `json:"system_overlay_path"`
|
||||
WorkDiskPath string `json:"work_disk_path"`
|
||||
LastError string `json:"last_error,omitempty"`
|
||||
}
|
||||
|
||||
type VMStats struct {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue