package daemon

import (
	"context"
	"path/filepath"
	"sync/atomic"
	"testing"
	"time"

	"banger/internal/model"
)

// waitForVMAlive polls until VMService.vmAlive reports true for vm or
// t fails out. Bounded so a broken fake can't hang the suite.
func waitForVMAlive(t *testing.T, svc *VMService, vm model.VMRecord) {
	t.Helper()
	deadline := time.Now().Add(2 * time.Second)
	for {
		if svc.vmAlive(vm) {
			return
		}
		if time.Now().After(deadline) {
			t.Fatal("fake firecracker never became alive per VMService.vmAlive")
		}
		time.Sleep(5 * time.Millisecond)
	}
}

// countingRunner records Run/RunSudo invocations without caring about
// the specific commands. Good enough for tests that want to assert
// "did the NAT capability reach the host at all?" — hostnat.Ensure's
// exact iptables/sysctl sequence is covered in the hostnat package
// tests, so we don't re-enumerate it here.
type countingRunner struct {
	runs     atomic.Int32
	runSudos atomic.Int32
	out      []byte
	err      error
}

func (r *countingRunner) Run(_ context.Context, _ string, _ ...string) ([]byte, error) {
	r.runs.Add(1)
	return r.out, r.err
}

func (r *countingRunner) RunSudo(_ context.Context, _ ...string) ([]byte, error) {
	r.runSudos.Add(1)
	return r.out, r.err
}

// total reports every host invocation the runner saw, sudo or not.
func (r *countingRunner) total() int32 {
	return r.runs.Load() + r.runSudos.Load()
}

// natCapabilityFixture wires just enough daemon state for natCapability
// tests: a HostNetwork + VMService with a countingRunner, a VM record
// whose handles carry a tap device, and the capability itself.
type natCapabilityFixture struct {
	cap    natCapability
	runner *countingRunner
	d      *Daemon
	vm     model.VMRecord
}

func newNATCapabilityFixture(t *testing.T, natEnabled bool) natCapabilityFixture {
	t.Helper()
	runner := &countingRunner{out: []byte("default via 10.0.0.1 dev eth0 proto static\n")}
	d := &Daemon{
		runner: runner,
		config: model.DaemonConfig{BridgeName: model.DefaultBridgeName},
	}
	wireServices(d)
	d.net.runner = runner

	// A real firecracker-looking subprocess so VMService.vmAlive — which
	// reads /proc/<pid>/cmdline and checks for "firecracker" + the API
	// socket path — returns true. Without this the ApplyConfigChange
	// "alive vs not alive" branches can't be exercised.
	apiSock := filepath.Join(t.TempDir(), "fc.sock")
	fc := startFakeFirecracker(t, apiSock)

	vm := testVM("natbox", "image-nat", "172.16.0.42")
	vm.Spec.NATEnabled = natEnabled
	vm.State = model.VMStateRunning
	vm.Runtime.State = model.VMStateRunning
	vm.Runtime.APISockPath = apiSock
	d.vm.setVMHandlesInMemory(vm.ID, model.VMHandles{
		PID:       fc.Process.Pid,
		TapDevice: "tap-nat-42",
	})

	// startFakeFirecracker uses `exec -a firecracker ...`, which renames
	// the process after Start returns — on a loaded CI box vmAlive can
	// observe the pre-exec cmdline ("bash") for a few ms and report a
	// false negative. Poll until /proc shows the firecracker name so the
	// fixture hands back a VM that's definitely "alive" by banger's
	// rules.
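	// (waitForVMAlive bounds that poll at 2 seconds and fails the test on
	// timeout, so a genuinely broken fake surfaces quickly instead of
	// wedging the fixture.)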
	waitForVMAlive(t, d.vm, vm)

	return natCapabilityFixture{
		cap:    newNATCapability(d.vm, d.net, d.logger),
		runner: runner,
		d:      d,
		vm:     vm,
	}
}

func TestNATCapabilityApplyConfigChange_NoOpWhenFlagUnchanged(t *testing.T) {
	f := newNATCapabilityFixture(t, true)

	if err := f.cap.ApplyConfigChange(context.Background(), f.vm, f.vm); err != nil {
		t.Fatalf("ApplyConfigChange: %v", err)
	}
	if n := f.runner.total(); n != 0 {
		t.Fatalf("runner calls = %d, want 0 when NATEnabled didn't change", n)
	}
}

func TestNATCapabilityApplyConfigChange_NoOpWhenVMNotAlive(t *testing.T) {
	f := newNATCapabilityFixture(t, false)

	// Clear handles → vmAlive returns false → ApplyConfigChange must
	// skip rather than attempt a tap-less ensureNAT.
	f.d.vm.clearVMHandles(f.vm)

	after := f.vm
	after.Spec.NATEnabled = true
	if err := f.cap.ApplyConfigChange(context.Background(), f.vm, after); err != nil {
		t.Fatalf("ApplyConfigChange: %v", err)
	}
	if n := f.runner.total(); n != 0 {
		t.Fatalf("runner calls = %d, want 0 when VM is not alive", n)
	}
}

func TestNATCapabilityApplyConfigChange_TogglesEnsureNATWhenAlive(t *testing.T) {
	f := newNATCapabilityFixture(t, false)

	after := f.vm
	after.Spec.NATEnabled = true
	if err := f.cap.ApplyConfigChange(context.Background(), f.vm, after); err != nil {
		t.Fatalf("ApplyConfigChange: %v", err)
	}
	if n := f.runner.total(); n == 0 {
		t.Fatal("runner calls = 0, want ensureNAT to reach the host when toggling NAT on a running VM")
	}
}

func TestNATCapabilityCleanup_NoOpWhenNATDisabled(t *testing.T) {
	f := newNATCapabilityFixture(t, false)

	if err := f.cap.Cleanup(context.Background(), f.vm); err != nil {
		t.Fatalf("Cleanup: %v", err)
	}
	if n := f.runner.total(); n != 0 {
		t.Fatalf("runner calls = %d, want 0 when NAT was never enabled", n)
	}
}

func TestNATCapabilityCleanup_NoOpWhenRuntimeHandlesMissing(t *testing.T) {
	f := newNATCapabilityFixture(t, true)

	// Runtime tap device becomes empty — simulates a VM that failed
	// before host wiring completed, so Cleanup has nothing to revert.
	f.d.vm.clearVMHandles(f.vm)

	if err := f.cap.Cleanup(context.Background(), f.vm); err != nil {
		t.Fatalf("Cleanup: %v", err)
	}
	if n := f.runner.total(); n != 0 {
		t.Fatalf("runner calls = %d, want 0 when tap/guestIP are empty", n)
	}
}

func TestNATCapabilityCleanup_ReversesNATWhenRuntimePresent(t *testing.T) {
	f := newNATCapabilityFixture(t, true)

	if err := f.cap.Cleanup(context.Background(), f.vm); err != nil {
		t.Fatalf("Cleanup: %v", err)
	}
	if n := f.runner.total(); n == 0 {
		t.Fatal("runner calls = 0, want ensureNAT(false) to execute when runtime wiring exists")
	}
}

// TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice simulates the
// post-crash / corrupt-handles.json scenario: the in-memory handle
// cache is empty, but the DB-backed VM.Runtime still carries the
// tap name (startVMLocked persists it alongside the handle cache).
// Cleanup must use that fallback so the iptables FORWARD rules
// keyed on the tap are actually removed — if Cleanup short-circuits
// the way it did before this fix, those rules leak forever.
func TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice(t *testing.T) {
	f := newNATCapabilityFixture(t, true)

	// Wipe the handle cache, as if the daemon had just restarted
	// against a corrupt (or missing) handles.json.
	f.d.vm.clearVMHandles(f.vm)

	// But the VM row in the DB still has the tap recorded.
	f.vm.Runtime.TapDevice = "tap-nat-42"

	if err := f.cap.Cleanup(context.Background(), f.vm); err != nil {
		t.Fatalf("Cleanup: %v", err)
	}
	if n := f.runner.total(); n == 0 {
		t.Fatal("runner calls = 0, want ensureNAT(false) to execute via the Runtime.TapDevice fallback; NAT rules would leak across daemon restarts")
	}
}
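
// For reference, the tap-resolution order the Cleanup tests above pin
// down (live handle cache first, persisted Runtime record as fallback)
// can be sketched roughly as follows. resolveTap is hypothetical and
// exists only in this comment; the real lookup lives inside
// natCapability.Cleanup:
//
//	func resolveTap(handles model.VMHandles, vm model.VMRecord) string {
//		if handles.TapDevice != "" {
//			return handles.TapDevice // normal path: daemon never restarted
//		}
//		return vm.Runtime.TapDevice // fallback: survives a lost handles.json
//	}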