Cleanup identity for kernel objects was split across two sources of
truth: vm.Runtime (DB-backed, durable) held the paths and the guest
IP, but the TAP name lived only in the in-process handle cache plus
the best-effort handles.json scratch file next to the VM dir. Every
other cleanup-identifying datum has a fallback: the firecracker PID
can be rediscovered via `pgrep -f <apiSock>`, loop devices via
losetup, and the dm name via the deterministic ShortID(vm.ID). The
tap is the one truly cache-only datum (allocated from a pool, not
derivable).
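
For contrast, the PID fallback is nearly a one-liner against the
exec runner. A minimal sketch, assuming a hypothetical helper named
pidFromAPISock and a runner shaped like the test file's
countingRunner; this is not the actual recovery code:

    // pidFromAPISock rediscovers the firecracker PID once the handle
    // cache is gone. `pgrep -f` matches the full command line, so the
    // unique API socket path pins down the process. Assumes a single
    // match; needs context, fmt, strconv, strings from the stdlib.
    func pidFromAPISock(ctx context.Context,
        run func(context.Context, string, ...string) ([]byte, error),
        apiSock string) (int, error) {
        out, err := run(ctx, "pgrep", "-f", apiSock)
        if err != nil {
            return 0, fmt.Errorf("no firecracker for %s: %w", apiSock, err)
        }
        return strconv.Atoi(strings.TrimSpace(string(out)))
    }

No such rediscovery exists for the tap, which is the whole problem.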
That cache-only status made NAT teardown fragile:
- daemon crash between `acquireTap` and the handles.json write
- handles.json corrupt on the next daemon start
- partial cleanup that already zeroed the cache
In any of those cases natCapability.Cleanup short-circuited
("skipping nat cleanup without runtime network handles") and the
per-VM POSTROUTING MASQUERADE rule plus the two FORWARD rules keyed
off the tap would leak. The VM row in the DB still existed, so a
retry couldn't close the loop: the tap name was simply gone.
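
The pre-fix control flow, roughly. A sketch reconstructed around the
log line above; the cache accessor, field names, and ensureNAT
wiring are assumptions, only the log message is verbatim:

    // Pre-fix shape: the handle cache is the only tap source, so a
    // wiped cache is indistinguishable from "never wired up" and the
    // iptables deletes are skipped entirely.
    func (c *natCapability) Cleanup(ctx context.Context, vm model.VMRecord) error {
        h, ok := c.vms.vmHandles(vm.ID)
        if !ok || h.TapDevice == "" {
            c.logger.Warn("skipping nat cleanup without runtime network handles")
            return nil // per-VM MASQUERADE + FORWARD rules stay behind
        }
        return c.net.ensureNAT(ctx, h.TapDevice, vm.Runtime.GuestIP, false)
    }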
Fix: mirror TapDevice onto model.VMRuntime (serialised via the
existing runtime_json column, with omitempty so existing rows
upgrade cleanly). Set it in startVMLocked right next to the
s.setVMHandles call that seeds the in-memory cache; clear it at
every post-cleanup reset site: the normal and stale branches of both
stop and kill, cleanupOnErr in start, reconcile's stale-vm branch,
and the stats poller's auto-stop path.
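
The shape of the model change. Only TapDevice and its omitempty tag
are given by this change; the neighbouring fields and json keys are
assumptions inferred from what the tests below touch:

    // model.VMRuntime, as serialised into the runtime_json column.
    type VMRuntime struct {
        State       VMState `json:"state,omitempty"`
        APISockPath string  `json:"api_sock_path,omitempty"`
        GuestIP     string  `json:"guest_ip,omitempty"`
        // TapDevice mirrors the handle cache's tap name into the
        // durable row. omitempty means rows written before this
        // change decode with an empty TapDevice; no migration needed.
        TapDevice string `json:"tap_device,omitempty"`
    }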
Fallbacks now cascade:
- natCapability.Cleanup: handles cache → Runtime.TapDevice
- cleanupRuntime (releaseTap): handles cache → Runtime.TapDevice
Both surfaces refuse gracefully (old behaviour) only when neither
source has a value, which really does mean "no tap was ever
allocated for this VM" rather than "we lost track of it."
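
In sketch form (the resolver helper and the cache accessor are
assumed names; the fallback order is the point):

    // resolveTap prefers the in-process handle cache and falls back
    // to the DB-backed runtime mirror; it returns "" only when no
    // tap was ever allocated for this VM.
    func (s *VMService) resolveTap(vm model.VMRecord) string {
        if h, ok := s.vmHandles(vm.ID); ok && h.TapDevice != "" {
            return h.TapDevice
        }
        return vm.Runtime.TapDevice
    }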
Test: TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice clears
the handle cache, sets vm.Runtime.TapDevice, and asserts Cleanup
still reaches the runner: the exact scenario the review flagged as a
plausible leak, exercised through the code path that now prevents
it.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

package daemon

import (
	"context"
	"path/filepath"
	"sync/atomic"
	"testing"
	"time"

	"banger/internal/model"
)

// waitForVMAlive polls until VMService.vmAlive reports true for vm or
// t fails out. Bounded so a broken fake can't hang the suite.
func waitForVMAlive(t *testing.T, svc *VMService, vm model.VMRecord) {
	t.Helper()
	deadline := time.Now().Add(2 * time.Second)
	for {
		if svc.vmAlive(vm) {
			return
		}
		if time.Now().After(deadline) {
			t.Fatal("fake firecracker never became alive per VMService.vmAlive")
		}
		time.Sleep(5 * time.Millisecond)
	}
}

// countingRunner records Run/RunSudo invocations without caring about
// the specific commands. Good enough for tests that want to assert
// "did the nat capability reach the host at all?" — hostnat.Ensure's
// exact iptables/sysctl sequence is covered in the hostnat package
// tests, so we don't re-enumerate it here.
type countingRunner struct {
	runs     atomic.Int32
	runSudos atomic.Int32
	out      []byte
	err      error
}

func (r *countingRunner) Run(_ context.Context, _ string, _ ...string) ([]byte, error) {
	r.runs.Add(1)
	return r.out, r.err
}

func (r *countingRunner) RunSudo(_ context.Context, _ ...string) ([]byte, error) {
	r.runSudos.Add(1)
	return r.out, r.err
}

func (r *countingRunner) total() int32 { return r.runs.Load() + r.runSudos.Load() }

// natCapabilityFixture wires just enough daemon state for natCapability
// tests: a HostNetwork + VMService with a countingRunner, a VM record
// whose handles carry a tap device, and the capability itself.
type natCapabilityFixture struct {
	cap    natCapability
	runner *countingRunner
	d      *Daemon
	vm     model.VMRecord
}

func newNATCapabilityFixture(t *testing.T, natEnabled bool) natCapabilityFixture {
	t.Helper()
	runner := &countingRunner{out: []byte("default via 10.0.0.1 dev eth0 proto static\n")}
	d := &Daemon{
		runner: runner,
		config: model.DaemonConfig{BridgeName: model.DefaultBridgeName},
	}
	wireServices(d)
	d.net.runner = runner

	// A real firecracker-looking subprocess so VMService.vmAlive — which
	// reads /proc/<pid>/cmdline and checks for "firecracker" + the api
	// socket path — returns true. Without this the ApplyConfigChange
	// "alive vs not alive" branches can't be exercised.
	apiSock := filepath.Join(t.TempDir(), "fc.sock")
	fc := startFakeFirecracker(t, apiSock)

	vm := testVM("natbox", "image-nat", "172.16.0.42")
	vm.Spec.NATEnabled = natEnabled
	vm.State = model.VMStateRunning
	vm.Runtime.State = model.VMStateRunning
	vm.Runtime.APISockPath = apiSock
	d.vm.setVMHandlesInMemory(vm.ID, model.VMHandles{
		PID:       fc.Process.Pid,
		TapDevice: "tap-nat-42",
	})

	// startFakeFirecracker uses `exec -a firecracker ...` which renames
	// the process after Start returns — on a loaded CI box vmAlive can
	// observe the pre-exec cmdline ("bash") for a few ms and
	// false-negative. Poll until /proc shows the firecracker name so
	// the fixture hands back a VM that's definitely "alive" by banger's
	// rules.
	waitForVMAlive(t, d.vm, vm)

	return natCapabilityFixture{
		cap:    newNATCapability(d.vm, d.net, d.logger),
		runner: runner,
		d:      d,
		vm:     vm,
	}
}

func TestNATCapabilityApplyConfigChange_NoOpWhenFlagUnchanged(t *testing.T) {
	f := newNATCapabilityFixture(t, true)
	if err := f.cap.ApplyConfigChange(context.Background(), f.vm, f.vm); err != nil {
		t.Fatalf("ApplyConfigChange: %v", err)
	}
	if n := f.runner.total(); n != 0 {
		t.Fatalf("runner calls = %d, want 0 when NATEnabled didn't change", n)
	}
}

func TestNATCapabilityApplyConfigChange_NoOpWhenVMNotAlive(t *testing.T) {
	f := newNATCapabilityFixture(t, false)
	// Clear handles → vmAlive returns false → ApplyConfigChange must
	// skip rather than attempt a tap-less ensureNAT.
	f.d.vm.clearVMHandles(f.vm)

	after := f.vm
	after.Spec.NATEnabled = true
	if err := f.cap.ApplyConfigChange(context.Background(), f.vm, after); err != nil {
		t.Fatalf("ApplyConfigChange: %v", err)
	}
	if n := f.runner.total(); n != 0 {
		t.Fatalf("runner calls = %d, want 0 when VM is not alive", n)
	}
}

func TestNATCapabilityApplyConfigChange_TogglesEnsureNATWhenAlive(t *testing.T) {
	f := newNATCapabilityFixture(t, false)
	after := f.vm
	after.Spec.NATEnabled = true
	if err := f.cap.ApplyConfigChange(context.Background(), f.vm, after); err != nil {
		t.Fatalf("ApplyConfigChange: %v", err)
	}
	if n := f.runner.total(); n == 0 {
		t.Fatal("runner calls = 0, want ensureNAT to reach the host when toggling NAT on a running VM")
	}
}

func TestNATCapabilityCleanup_NoOpWhenNATDisabled(t *testing.T) {
	f := newNATCapabilityFixture(t, false)
	if err := f.cap.Cleanup(context.Background(), f.vm); err != nil {
		t.Fatalf("Cleanup: %v", err)
	}
	if n := f.runner.total(); n != 0 {
		t.Fatalf("runner calls = %d, want 0 when NAT was never enabled", n)
	}
}

func TestNATCapabilityCleanup_NoOpWhenRuntimeHandlesMissing(t *testing.T) {
	f := newNATCapabilityFixture(t, true)
	// Both tap sources end up empty: the handle cache is cleared here
	// and the fixture never set Runtime.TapDevice. Simulates a VM that
	// failed before host wiring completed, so Cleanup has nothing to
	// revert.
	f.d.vm.clearVMHandles(f.vm)

	if err := f.cap.Cleanup(context.Background(), f.vm); err != nil {
		t.Fatalf("Cleanup: %v", err)
	}
	if n := f.runner.total(); n != 0 {
		t.Fatalf("runner calls = %d, want 0 when tap/guestIP are empty", n)
	}
}

func TestNATCapabilityCleanup_ReversesNATWhenRuntimePresent(t *testing.T) {
	f := newNATCapabilityFixture(t, true)
	if err := f.cap.Cleanup(context.Background(), f.vm); err != nil {
		t.Fatalf("Cleanup: %v", err)
	}
	if n := f.runner.total(); n == 0 {
		t.Fatal("runner calls = 0, want ensureNAT(false) to execute when runtime wiring exists")
	}
}

// TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice simulates the
// post-crash / corrupt-handles.json scenario: the in-memory handle
// cache is empty, but the DB-backed VM.Runtime still carries the
// tap name (startVMLocked persists it alongside the handle cache).
// Cleanup must use that fallback so the iptables FORWARD rules
// keyed on the tap are actually removed — if Cleanup short-circuits
// the way it did before this fix, those rules leak forever.
func TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice(t *testing.T) {
	f := newNATCapabilityFixture(t, true)
	// Wipe the handle cache, as if the daemon had just restarted
	// against a corrupt (or missing) handles.json.
	f.d.vm.clearVMHandles(f.vm)
	// But the VM row in the DB still has the tap recorded.
	f.vm.Runtime.TapDevice = "tap-nat-42"

	if err := f.cap.Cleanup(context.Background(), f.vm); err != nil {
		t.Fatalf("Cleanup: %v", err)
	}
	if n := f.runner.total(); n == 0 {
		t.Fatal("runner calls = 0, want ensureNAT(false) to execute via the Runtime.TapDevice fallback; NAT rules would leak across daemon restarts")
	}
}