From 5eceebe49f6861c627c1564e309270d1eedd8630 Mon Sep 17 00:00:00 2001
From: Thales Maciel
Date: Thu, 23 Apr 2026 14:21:13 -0300
Subject: [PATCH] daemon: persist tap device on VM.Runtime so NAT teardown
 survives handle-cache loss
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cleanup identity for kernel objects was split across two sources of
truth: vm.Runtime (DB-backed, durable) held paths and the guest IP,
but the TAP name lived only in the in-process handle cache + the
best-effort handles.json scratch file next to the VM dir. Every other
cleanup-identifying datum has a fallback — firecracker PID can be
rediscovered via `pgrep -f `, loops via losetup, dm name from the
deterministic ShortID(vm.ID). The tap is the one truly cache-only
datum (allocated from a pool, not derivable). That made NAT teardown
fragile:

- daemon crash between `acquireTap` and the handles.json write
- handles.json corrupt on the next daemon start
- partial cleanup that already zeroed the cache

In any of those cases natCapability.Cleanup short-circuited ("skipping
nat cleanup without runtime network handles") and the per-VM
POSTROUTING MASQUERADE + the two FORWARD rules keyed off the tap would
leak. The VM row in the DB still existed, so a retry couldn't close
the loop — the tap name was simply gone.

Fix: mirror TapDevice onto model.VMRuntime (serialised via the
existing runtime_json column, omitempty so existing rows upgrade
cleanly). Set it in startVMLocked right next to the s.setVMHandles
call that seeds the in-memory cache; clear it at every post-cleanup
reset site (stop normal path + stop stale branch, kill normal path +
kill stale branch, cleanupOnErr in start, reconcile's stale-vm branch,
the stats poller's auto-stop path).

Fallbacks now cascade:

- natCapability.Cleanup: handles cache → Runtime.TapDevice
- cleanupRuntime (releaseTap): handles cache → Runtime.TapDevice

Both surfaces refuse gracefully (old behaviour) only when neither
source has a value, which really does mean "no tap was ever allocated
for this VM" rather than "we lost track of it."

Test: TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice clears the
handle cache, sets vm.Runtime.TapDevice, and asserts Cleanup reaches
the runner — the exact scenario the review flagged as a plausible leak
and the exact code path that now guarantees it doesn't.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 internal/daemon/capabilities.go        | 10 ++++++--
 internal/daemon/daemon.go              |  1 +
 internal/daemon/nat_capability_test.go | 23 ++++++++++++++++++
 internal/daemon/vm.go                  | 12 ++++++++--
 internal/daemon/vm_lifecycle.go        |  9 ++++++++
 internal/daemon/vm_stats.go            |  1 +
 internal/model/types.go                | 32 ++++++++++++++++----------
 7 files changed, 72 insertions(+), 16 deletions(-)

diff --git a/internal/daemon/capabilities.go b/internal/daemon/capabilities.go
index 319df39..d4037c2 100644
--- a/internal/daemon/capabilities.go
+++ b/internal/daemon/capabilities.go
@@ -326,8 +326,14 @@ func (c natCapability) Cleanup(ctx context.Context, vm model.VMRecord) error {
 	if !vm.Spec.NATEnabled {
 		return nil
 	}
-	tap := c.vm.vmHandles(vm.ID).TapDevice
-	if strings.TrimSpace(vm.Runtime.GuestIP) == "" || strings.TrimSpace(tap) == "" {
+	// Handle cache is volatile across daemon restarts; Runtime is
+	// the persisted DB-backed copy. Fall back so a crash / corrupt
+	// handles.json doesn't leak iptables rules keyed off the tap.
+	tap := strings.TrimSpace(c.vm.vmHandles(vm.ID).TapDevice)
+	if tap == "" {
+		tap = strings.TrimSpace(vm.Runtime.TapDevice)
+	}
+	if strings.TrimSpace(vm.Runtime.GuestIP) == "" || tap == "" {
 		if c.logger != nil {
 			c.logger.Debug("skipping nat cleanup without runtime network handles", append(vmLogAttrs(vm), "guest_ip", vm.Runtime.GuestIP, "tap_device", tap)...)
 		}
diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go
index 68eb77c..6ef0375 100644
--- a/internal/daemon/daemon.go
+++ b/internal/daemon/daemon.go
@@ -552,6 +552,7 @@ func (d *Daemon) reconcile(ctx context.Context) error {
 			_ = d.vm.cleanupRuntime(ctx, vm, true)
 			vm.State = model.VMStateStopped
 			vm.Runtime.State = model.VMStateStopped
+			vm.Runtime.TapDevice = ""
 			d.vm.clearVMHandles(vm)
 			vm.UpdatedAt = model.Now()
 			return d.store.UpsertVM(ctx, vm)
diff --git a/internal/daemon/nat_capability_test.go b/internal/daemon/nat_capability_test.go
index 0ab18d9..f25ea1a 100644
--- a/internal/daemon/nat_capability_test.go
+++ b/internal/daemon/nat_capability_test.go
@@ -174,3 +174,26 @@ func TestNATCapabilityCleanup_ReversesNATWhenRuntimePresent(t *testing.T) {
 		t.Fatal("runner calls = 0, want ensureNAT(false) to execute when runtime wiring exists")
 	}
 }
+
+// TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice simulates the
+// post-crash / corrupt-handles.json scenario: the in-memory handle
+// cache is empty, but the DB-backed VM.Runtime still carries the
+// tap name (startVMLocked persists it alongside the handle cache).
+// Cleanup must use that fallback so the iptables FORWARD rules
+// keyed on the tap are actually removed — if Cleanup short-circuits
+// the way it did before this fix, those rules leak forever.
+func TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice(t *testing.T) {
+	f := newNATCapabilityFixture(t, true)
+	// Wipe the handle cache, as if the daemon had just restarted
+	// against a corrupt (or missing) handles.json.
+	f.d.vm.clearVMHandles(f.vm)
+	// But the VM row in the DB still has the tap recorded.
+	f.vm.Runtime.TapDevice = "tap-nat-42"
+
+	if err := f.cap.Cleanup(context.Background(), f.vm); err != nil {
+		t.Fatalf("Cleanup: %v", err)
+	}
+	if n := f.runner.total(); n == 0 {
+		t.Fatal("runner calls = 0, want ensureNAT(false) to execute via the Runtime.TapDevice fallback; NAT rules would leak across daemon restarts")
+	}
+}
diff --git a/internal/daemon/vm.go b/internal/daemon/vm.go
index 3bdff67..b9d4f83 100644
--- a/internal/daemon/vm.go
+++ b/internal/daemon/vm.go
@@ -82,8 +82,16 @@ func (s *VMService) cleanupRuntime(ctx context.Context, vm model.VMRecord, prese
 	})
 	featureErr := s.capHooks.cleanupState(ctx, vm)
 	var tapErr error
-	if h.TapDevice != "" {
-		tapErr = s.net.releaseTap(ctx, h.TapDevice)
+	// Prefer the handle cache (fresh from startVMLocked), but fall
+	// back to Runtime.TapDevice — persisted to the DB in the same
+	// stage — so a daemon restart or corrupt handles.json doesn't
+	// leak the tap (or the NAT FORWARD rules keyed off it).
+	tap := h.TapDevice
+	if tap == "" {
+		tap = vm.Runtime.TapDevice
+	}
+	if tap != "" {
+		tapErr = s.net.releaseTap(ctx, tap)
 	}
 	if vm.Runtime.APISockPath != "" {
 		_ = os.Remove(vm.Runtime.APISockPath)
diff --git a/internal/daemon/vm_lifecycle.go b/internal/daemon/vm_lifecycle.go
index 0190a41..6d4ac45 100644
--- a/internal/daemon/vm_lifecycle.go
+++ b/internal/daemon/vm_lifecycle.go
@@ -123,6 +123,7 @@ func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image
 		if cleanupErr := s.cleanupRuntime(context.Background(), vm, true); cleanupErr != nil {
 			err = errors.Join(err, cleanupErr)
 		}
+		vm.Runtime.TapDevice = ""
 		s.clearVMHandles(vm)
 		_ = s.store.UpsertVM(context.Background(), vm)
 		return model.VMRecord{}, err
@@ -165,6 +166,10 @@
 	}
 	live.TapDevice = tap
 	s.setVMHandles(vm, live)
+	// Mirror onto VM.Runtime so NAT teardown can recover the tap
+	// name from the DB even if the handle cache is empty (daemon
+	// crash + restart, corrupt handles.json).
+	vm.Runtime.TapDevice = tap
 	op.stage("metrics_file", "metrics_path", vm.Runtime.MetricsPath)
 	if err := os.WriteFile(vm.Runtime.MetricsPath, nil, 0o644); err != nil {
 		return cleanupOnErr(err)
@@ -277,6 +282,7 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v
 	}
 	vm.State = model.VMStateStopped
 	vm.Runtime.State = model.VMStateStopped
+	vm.Runtime.TapDevice = ""
 	s.clearVMHandles(vm)
 	if err := s.store.UpsertVM(ctx, vm); err != nil {
 		return model.VMRecord{}, err
@@ -301,6 +307,7 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v
 	}
 	vm.State = model.VMStateStopped
 	vm.Runtime.State = model.VMStateStopped
+	vm.Runtime.TapDevice = ""
 	s.clearVMHandles(vm)
 	system.TouchNow(&vm)
 	if err := s.store.UpsertVM(ctx, vm); err != nil {
@@ -332,6 +339,7 @@ func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, si
 	}
 	vm.State = model.VMStateStopped
 	vm.Runtime.State = model.VMStateStopped
+	vm.Runtime.TapDevice = ""
 	s.clearVMHandles(vm)
 	if err := s.store.UpsertVM(ctx, vm); err != nil {
 		return model.VMRecord{}, err
@@ -361,6 +369,7 @@ func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, si
 	}
 	vm.State = model.VMStateStopped
 	vm.Runtime.State = model.VMStateStopped
+	vm.Runtime.TapDevice = ""
 	s.clearVMHandles(vm)
 	system.TouchNow(&vm)
 	if err := s.store.UpsertVM(ctx, vm); err != nil {
diff --git a/internal/daemon/vm_stats.go b/internal/daemon/vm_stats.go
index 61c43df..62e8d85 100644
--- a/internal/daemon/vm_stats.go
+++ b/internal/daemon/vm_stats.go
@@ -128,6 +128,7 @@ func (s *VMService) stopStaleVMs(ctx context.Context) (err error) {
 		_ = s.cleanupRuntime(ctx, vm, true)
 		vm.State = model.VMStateStopped
 		vm.Runtime.State = model.VMStateStopped
+		vm.Runtime.TapDevice = ""
 		s.clearVMHandles(vm)
 		vm.UpdatedAt = model.Now()
 		return s.store.UpsertVM(ctx, vm)
diff --git a/internal/model/types.go b/internal/model/types.go
index 6c04034..940a815 100644
--- a/internal/model/types.go
+++ b/internal/model/types.go
@@ -101,18 +101,26 @@ type VMSpec struct {
 // LastError carries the last failure message for debugging. State
 // mirrors VMRecord.State.
 type VMRuntime struct {
-	State         VMState `json:"state"`
-	GuestIP       string  `json:"guest_ip"`
-	APISockPath   string  `json:"api_sock_path,omitempty"`
-	VSockPath     string  `json:"vsock_path,omitempty"`
-	VSockCID      uint32  `json:"vsock_cid,omitempty"`
-	LogPath       string  `json:"log_path,omitempty"`
-	MetricsPath   string  `json:"metrics_path,omitempty"`
-	DNSName       string  `json:"dns_name,omitempty"`
-	VMDir         string  `json:"vm_dir"`
-	SystemOverlay string  `json:"system_overlay_path"`
-	WorkDiskPath  string  `json:"work_disk_path"`
-	LastError     string  `json:"last_error,omitempty"`
+	State       VMState `json:"state"`
+	GuestIP     string  `json:"guest_ip"`
+	APISockPath string  `json:"api_sock_path,omitempty"`
+	VSockPath   string  `json:"vsock_path,omitempty"`
+	VSockCID    uint32  `json:"vsock_cid,omitempty"`
+	LogPath     string  `json:"log_path,omitempty"`
+	MetricsPath string  `json:"metrics_path,omitempty"`
+	DNSName     string  `json:"dns_name,omitempty"`
+	VMDir       string  `json:"vm_dir"`
+	// TapDevice mirrors VMHandles.TapDevice but persists across
+	// daemon restarts / handle-cache loss. NAT teardown needs the
+	// exact tap name to delete the FORWARD rules; if we only had
+	// the handle cache, a crash between tap acquire and handles.json
+	// write — or a corrupt handles.json on the next daemon start —
+	// would silently leak the rules. Storing it on the VM record
+	// makes cleanup correct as long as the VM row exists.
+	TapDevice     string `json:"tap_device,omitempty"`
+	SystemOverlay string `json:"system_overlay_path"`
+	WorkDiskPath  string `json:"work_disk_path"`
+	LastError     string `json:"last_error,omitempty"`
 }
 
 type VMStats struct {