daemon: persist tap device on VM.Runtime so NAT teardown survives handle-cache loss
Cleanup identity for kernel objects was split across two sources of
truth: vm.Runtime (DB-backed, durable) held paths and the guest IP,
but the TAP name lived only in the in-process handle cache + the
best-effort handles.json scratch file next to the VM dir. Every
other cleanup-identifying datum has a fallback — firecracker PID
can be rediscovered via `pgrep -f <apiSock>`, loop devices via
`losetup`, and the dm name from the deterministic ShortID(vm.ID).
The tap is the one truly cache-only datum (allocated from a pool,
not derivable).
That made NAT teardown fragile:
- daemon crash between `acquireTap` and the handles.json write
- handles.json corrupt on the next daemon start
- partial cleanup that already zeroed the cache
In any of those cases, natCapability.Cleanup short-circuited
("skipping nat cleanup without runtime network handles") and the
per-VM POSTROUTING MASQUERADE + the two FORWARD rules keyed off
the tap would leak. The VM row in the DB still existed, so a retry
couldn't close the loop — the tap name was simply gone.
Fix: mirror TapDevice onto model.VMRuntime (serialised via the
existing runtime_json column, omitempty so existing rows upgrade
cleanly). Set it in startVMLocked right next to the
s.setVMHandles call that seeds the in-memory cache; clear it at
every post-cleanup reset site (stop normal path + stop stale
branch, kill normal path + kill stale branch, cleanupOnErr in
start, reconcile's stale-vm branch, the stats poller's auto-stop
path).
Fallbacks now cascade:
- natCapability.Cleanup: handles cache → Runtime.TapDevice
- cleanupRuntime (releaseTap): handles cache → Runtime.TapDevice
Both surfaces refuse gracefully (old behaviour) only when neither
source has a value, which really does mean "no tap was ever
allocated for this VM" rather than "we lost track of it."
Test: TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice clears
the handle cache, sets vm.Runtime.TapDevice, and asserts Cleanup
reaches the runner — the exact scenario the review flagged as a
plausible leak, exercising the exact code path that now guarantees
it doesn't leak.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1850904d9c
commit
5eceebe49f
7 changed files with 72 additions and 16 deletions
|
|
@ -326,8 +326,14 @@ func (c natCapability) Cleanup(ctx context.Context, vm model.VMRecord) error {
|
||||||
if !vm.Spec.NATEnabled {
|
if !vm.Spec.NATEnabled {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
tap := c.vm.vmHandles(vm.ID).TapDevice
|
// Handle cache is volatile across daemon restarts; Runtime is
|
||||||
if strings.TrimSpace(vm.Runtime.GuestIP) == "" || strings.TrimSpace(tap) == "" {
|
// the persisted DB-backed copy. Fall back so a crash / corrupt
|
||||||
|
// handles.json doesn't leak iptables rules keyed off the tap.
|
||||||
|
tap := strings.TrimSpace(c.vm.vmHandles(vm.ID).TapDevice)
|
||||||
|
if tap == "" {
|
||||||
|
tap = strings.TrimSpace(vm.Runtime.TapDevice)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(vm.Runtime.GuestIP) == "" || tap == "" {
|
||||||
if c.logger != nil {
|
if c.logger != nil {
|
||||||
c.logger.Debug("skipping nat cleanup without runtime network handles", append(vmLogAttrs(vm), "guest_ip", vm.Runtime.GuestIP, "tap_device", tap)...)
|
c.logger.Debug("skipping nat cleanup without runtime network handles", append(vmLogAttrs(vm), "guest_ip", vm.Runtime.GuestIP, "tap_device", tap)...)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -552,6 +552,7 @@ func (d *Daemon) reconcile(ctx context.Context) error {
|
||||||
_ = d.vm.cleanupRuntime(ctx, vm, true)
|
_ = d.vm.cleanupRuntime(ctx, vm, true)
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
|
vm.Runtime.TapDevice = ""
|
||||||
d.vm.clearVMHandles(vm)
|
d.vm.clearVMHandles(vm)
|
||||||
vm.UpdatedAt = model.Now()
|
vm.UpdatedAt = model.Now()
|
||||||
return d.store.UpsertVM(ctx, vm)
|
return d.store.UpsertVM(ctx, vm)
|
||||||
|
|
|
||||||
|
|
@ -174,3 +174,26 @@ func TestNATCapabilityCleanup_ReversesNATWhenRuntimePresent(t *testing.T) {
|
||||||
t.Fatal("runner calls = 0, want ensureNAT(false) to execute when runtime wiring exists")
|
t.Fatal("runner calls = 0, want ensureNAT(false) to execute when runtime wiring exists")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice simulates the
|
||||||
|
// post-crash / corrupt-handles.json scenario: the in-memory handle
|
||||||
|
// cache is empty, but the DB-backed VM.Runtime still carries the
|
||||||
|
// tap name (startVMLocked persists it alongside the handle cache).
|
||||||
|
// Cleanup must use that fallback so the iptables FORWARD rules
|
||||||
|
// keyed on the tap are actually removed — if Cleanup short-circuits
|
||||||
|
// the way it did before this fix, those rules leak forever.
|
||||||
|
func TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice(t *testing.T) {
|
||||||
|
f := newNATCapabilityFixture(t, true)
|
||||||
|
// Wipe the handle cache, as if the daemon had just restarted
|
||||||
|
// against a corrupt (or missing) handles.json.
|
||||||
|
f.d.vm.clearVMHandles(f.vm)
|
||||||
|
// But the VM row in the DB still has the tap recorded.
|
||||||
|
f.vm.Runtime.TapDevice = "tap-nat-42"
|
||||||
|
|
||||||
|
if err := f.cap.Cleanup(context.Background(), f.vm); err != nil {
|
||||||
|
t.Fatalf("Cleanup: %v", err)
|
||||||
|
}
|
||||||
|
if n := f.runner.total(); n == 0 {
|
||||||
|
t.Fatal("runner calls = 0, want ensureNAT(false) to execute via the Runtime.TapDevice fallback; NAT rules would leak across daemon restarts")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -82,8 +82,16 @@ func (s *VMService) cleanupRuntime(ctx context.Context, vm model.VMRecord, prese
|
||||||
})
|
})
|
||||||
featureErr := s.capHooks.cleanupState(ctx, vm)
|
featureErr := s.capHooks.cleanupState(ctx, vm)
|
||||||
var tapErr error
|
var tapErr error
|
||||||
if h.TapDevice != "" {
|
// Prefer the handle cache (fresh from startVMLocked), but fall
|
||||||
tapErr = s.net.releaseTap(ctx, h.TapDevice)
|
// back to Runtime.TapDevice — persisted to the DB in the same
|
||||||
|
// stage — so a daemon restart or corrupt handles.json doesn't
|
||||||
|
// leak the tap (or the NAT FORWARD rules keyed off it).
|
||||||
|
tap := h.TapDevice
|
||||||
|
if tap == "" {
|
||||||
|
tap = vm.Runtime.TapDevice
|
||||||
|
}
|
||||||
|
if tap != "" {
|
||||||
|
tapErr = s.net.releaseTap(ctx, tap)
|
||||||
}
|
}
|
||||||
if vm.Runtime.APISockPath != "" {
|
if vm.Runtime.APISockPath != "" {
|
||||||
_ = os.Remove(vm.Runtime.APISockPath)
|
_ = os.Remove(vm.Runtime.APISockPath)
|
||||||
|
|
|
||||||
|
|
@ -123,6 +123,7 @@ func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image
|
||||||
if cleanupErr := s.cleanupRuntime(context.Background(), vm, true); cleanupErr != nil {
|
if cleanupErr := s.cleanupRuntime(context.Background(), vm, true); cleanupErr != nil {
|
||||||
err = errors.Join(err, cleanupErr)
|
err = errors.Join(err, cleanupErr)
|
||||||
}
|
}
|
||||||
|
vm.Runtime.TapDevice = ""
|
||||||
s.clearVMHandles(vm)
|
s.clearVMHandles(vm)
|
||||||
_ = s.store.UpsertVM(context.Background(), vm)
|
_ = s.store.UpsertVM(context.Background(), vm)
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
|
|
@ -165,6 +166,10 @@ func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image
|
||||||
}
|
}
|
||||||
live.TapDevice = tap
|
live.TapDevice = tap
|
||||||
s.setVMHandles(vm, live)
|
s.setVMHandles(vm, live)
|
||||||
|
// Mirror onto VM.Runtime so NAT teardown can recover the tap
|
||||||
|
// name from the DB even if the handle cache is empty (daemon
|
||||||
|
// crash + restart, corrupt handles.json).
|
||||||
|
vm.Runtime.TapDevice = tap
|
||||||
op.stage("metrics_file", "metrics_path", vm.Runtime.MetricsPath)
|
op.stage("metrics_file", "metrics_path", vm.Runtime.MetricsPath)
|
||||||
if err := os.WriteFile(vm.Runtime.MetricsPath, nil, 0o644); err != nil {
|
if err := os.WriteFile(vm.Runtime.MetricsPath, nil, 0o644); err != nil {
|
||||||
return cleanupOnErr(err)
|
return cleanupOnErr(err)
|
||||||
|
|
@ -277,6 +282,7 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v
|
||||||
}
|
}
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
|
vm.Runtime.TapDevice = ""
|
||||||
s.clearVMHandles(vm)
|
s.clearVMHandles(vm)
|
||||||
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
|
|
@ -301,6 +307,7 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v
|
||||||
}
|
}
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
|
vm.Runtime.TapDevice = ""
|
||||||
s.clearVMHandles(vm)
|
s.clearVMHandles(vm)
|
||||||
system.TouchNow(&vm)
|
system.TouchNow(&vm)
|
||||||
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
||||||
|
|
@ -332,6 +339,7 @@ func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, si
|
||||||
}
|
}
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
|
vm.Runtime.TapDevice = ""
|
||||||
s.clearVMHandles(vm)
|
s.clearVMHandles(vm)
|
||||||
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
|
|
@ -361,6 +369,7 @@ func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, si
|
||||||
}
|
}
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
|
vm.Runtime.TapDevice = ""
|
||||||
s.clearVMHandles(vm)
|
s.clearVMHandles(vm)
|
||||||
system.TouchNow(&vm)
|
system.TouchNow(&vm)
|
||||||
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
||||||
|
|
|
||||||
|
|
@ -128,6 +128,7 @@ func (s *VMService) stopStaleVMs(ctx context.Context) (err error) {
|
||||||
_ = s.cleanupRuntime(ctx, vm, true)
|
_ = s.cleanupRuntime(ctx, vm, true)
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
|
vm.Runtime.TapDevice = ""
|
||||||
s.clearVMHandles(vm)
|
s.clearVMHandles(vm)
|
||||||
vm.UpdatedAt = model.Now()
|
vm.UpdatedAt = model.Now()
|
||||||
return s.store.UpsertVM(ctx, vm)
|
return s.store.UpsertVM(ctx, vm)
|
||||||
|
|
|
||||||
|
|
@ -101,18 +101,26 @@ type VMSpec struct {
|
||||||
// LastError carries the last failure message for debugging. State
|
// LastError carries the last failure message for debugging. State
|
||||||
// mirrors VMRecord.State.
|
// mirrors VMRecord.State.
|
||||||
type VMRuntime struct {
|
type VMRuntime struct {
|
||||||
State VMState `json:"state"`
|
State VMState `json:"state"`
|
||||||
GuestIP string `json:"guest_ip"`
|
GuestIP string `json:"guest_ip"`
|
||||||
APISockPath string `json:"api_sock_path,omitempty"`
|
APISockPath string `json:"api_sock_path,omitempty"`
|
||||||
VSockPath string `json:"vsock_path,omitempty"`
|
VSockPath string `json:"vsock_path,omitempty"`
|
||||||
VSockCID uint32 `json:"vsock_cid,omitempty"`
|
VSockCID uint32 `json:"vsock_cid,omitempty"`
|
||||||
LogPath string `json:"log_path,omitempty"`
|
LogPath string `json:"log_path,omitempty"`
|
||||||
MetricsPath string `json:"metrics_path,omitempty"`
|
MetricsPath string `json:"metrics_path,omitempty"`
|
||||||
DNSName string `json:"dns_name,omitempty"`
|
DNSName string `json:"dns_name,omitempty"`
|
||||||
VMDir string `json:"vm_dir"`
|
VMDir string `json:"vm_dir"`
|
||||||
SystemOverlay string `json:"system_overlay_path"`
|
// TapDevice mirrors VMHandles.TapDevice but persists across
|
||||||
WorkDiskPath string `json:"work_disk_path"`
|
// daemon restarts / handle-cache loss. NAT teardown needs the
|
||||||
LastError string `json:"last_error,omitempty"`
|
// exact tap name to delete the FORWARD rules; if we only had
|
||||||
|
// the handle cache, a crash between tap acquire and handles.json
|
||||||
|
// write — or a corrupt handles.json on the next daemon start —
|
||||||
|
// would silently leak the rules. Storing it on the VM record
|
||||||
|
// makes cleanup correct as long as the VM row exists.
|
||||||
|
TapDevice string `json:"tap_device,omitempty"`
|
||||||
|
SystemOverlay string `json:"system_overlay_path"`
|
||||||
|
WorkDiskPath string `json:"work_disk_path"`
|
||||||
|
LastError string `json:"last_error,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type VMStats struct {
|
type VMStats struct {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue