daemon split (1/5): extract *HostNetwork service

First phase of splitting the daemon god-struct into focused services
with explicit ownership.

HostNetwork now owns everything host-networking: the TAP interface
pool (initializeTapPool / ensureTapPool / acquireTap / releaseTap /
createTap), bridge + socket dir setup, firecracker process primitives
(find/resolve/kill/wait/ensureSocketAccess/sendCtrlAltDel), DM
snapshot lifecycle, NAT rule enforcement, guest DNS server lifecycle
+ routing setup, and the vsock-agent readiness probe. That's 7 files
whose receivers flipped from *Daemon to *HostNetwork, plus a new
host_network.go that declares the struct, its hostNetworkDeps, and
the factored firecracker + DNS helpers that used to live in vm.go.

Daemon gives up the tapPool and vmDNS fields entirely; they're now
HostNetwork's business. Construction goes through newHostNetwork in
Daemon.Open with an explicit dependency bag (runner, logger, config,
layout, closing). A lazy-init hostNet() helper on Daemon supports
test literals that don't wire net explicitly — production always
populates it eagerly.

Signature tightenings where the old receiver reached into VM-service
state:
 - ensureNAT(ctx, vm, enable) → ensureNAT(ctx, guestIP, tap, enable).
   Callers resolve tap from the handle cache themselves.
 - initializeTapPool(ctx) → initializeTapPool(usedTaps []string).
   Daemon.Open enumerates VMs, collects taps from handles, hands the
   slice in.

rebuildDNS stays on *Daemon as the orchestrator — it filters by
vm-alive (a VMService concern that handles will migrate to in phase 4)
and then calls HostNetwork.replaceDNS with the already-filtered map.

Capability hooks continue to take *Daemon; they now use it as a
facade to reach services (d.net.ensureNAT, d.hostNet().*). Planned
CapabilityHost interface extraction is orthogonal, left for later.

Tests: dns_routing_test.go + fastpath_test.go + nat_test.go +
snapshot_test.go + open_close_test.go were touched to construct
HostNetwork literals where they exercise its methods directly, or
route through d.hostNet() where they exercise the Daemon entry
points.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Thales Maciel 2026-04-20 20:11:46 -03:00
parent eba9a553bf
commit 362009d747
No known key found for this signature in database
GPG key ID: 33112E6833C34679
18 changed files with 461 additions and 326 deletions

View file

@ -4,23 +4,20 @@ import (
"context"
"errors"
"fmt"
"log/slog"
"net"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"banger/internal/daemon/fcproc"
"banger/internal/firecracker"
"banger/internal/model"
"banger/internal/namegen"
"banger/internal/system"
"banger/internal/vmdns"
"banger/internal/vsockagent"
)
// Cross-service constants. Kept in vm.go because both lifecycle
// (VMService) and networking (HostNetwork) reference them; moving
// them to either owner would read as a layering violation.
var (
errWaitForExitTimeout = fcproc.ErrWaitForExitTimeout
gracefulShutdownWait = 10 * time.Second
@ -28,59 +25,43 @@ var (
vsockReadyPoll = 200 * time.Millisecond
)
// fc builds a fresh fcproc.Manager from the Daemon's current runner, config,
// and layout. Manager is stateless beyond those handles, so constructing per
// call keeps tests that build Daemon literals working without extra wiring.
func (d *Daemon) fc() *fcproc.Manager {
return fcproc.New(d.runner, fcproc.Config{
FirecrackerBin: d.config.FirecrackerBin,
BridgeName: d.config.BridgeName,
BridgeIP: d.config.BridgeIP,
CIDR: d.config.CIDR,
RuntimeDir: d.layout.RuntimeDir,
}, d.logger)
// rebuildDNS re-derives the complete DNS record set from the VM store
// and republishes it. It deliberately lives on *Daemon rather than on
// HostNetwork: liveness ("is this VM running?") is VM-service
// territory, so the Daemon does the filtering itself and hands
// HostNetwork.replaceDNS a finished name→IP map.
func (d *Daemon) rebuildDNS(ctx context.Context) error {
	// DNS wiring absent (e.g. minimal test literals): nothing to publish.
	if d.net == nil {
		return nil
	}
	vms, err := d.store.ListVMs(ctx)
	if err != nil {
		return err
	}
	records := make(map[string]string, len(vms))
	for _, rec := range vms {
		// Only live VMs that actually have a guest IP get a record.
		if !d.vmAlive(rec) || strings.TrimSpace(rec.Runtime.GuestIP) == "" {
			continue
		}
		records[vmDNSRecordName(rec.Name)] = rec.Runtime.GuestIP
	}
	return d.hostNet().replaceDNS(records)
}
// ensureBridge guarantees the host bridge device exists; thin
// pass-through to fcproc.Manager.EnsureBridge.
func (d *Daemon) ensureBridge(ctx context.Context) error {
	mgr := d.fc()
	return mgr.EnsureBridge(ctx)
}
// ensureSocketDir makes sure the firecracker socket directory exists;
// thin pass-through to fcproc.Manager.EnsureSocketDir.
func (d *Daemon) ensureSocketDir() error {
	mgr := d.fc()
	return mgr.EnsureSocketDir()
}
// createTap creates the named TAP interface on the host; thin
// pass-through to fcproc.Manager.CreateTap.
func (d *Daemon) createTap(ctx context.Context, tap string) error {
	mgr := d.fc()
	return mgr.CreateTap(ctx, tap)
}
// firecrackerBinary resolves the absolute path of the firecracker
// executable; thin pass-through to fcproc.Manager.ResolveBinary.
func (d *Daemon) firecrackerBinary() (string, error) {
	mgr := d.fc()
	return mgr.ResolveBinary()
}
// ensureSocketAccess waits for/validates access to the given socket,
// using label for diagnostics; thin pass-through to
// fcproc.Manager.EnsureSocketAccess.
func (d *Daemon) ensureSocketAccess(ctx context.Context, socketPath, label string) error {
	mgr := d.fc()
	return mgr.EnsureSocketAccess(ctx, socketPath, label)
}
// findFirecrackerPID looks up the PID of the firecracker process bound
// to apiSock; thin pass-through to fcproc.Manager.FindPID.
func (d *Daemon) findFirecrackerPID(ctx context.Context, apiSock string) (int, error) {
	mgr := d.fc()
	return mgr.FindPID(ctx, apiSock)
}
// resolveFirecrackerPID determines the firecracker PID from the machine
// handle and/or its API socket; thin pass-through to
// fcproc.Manager.ResolvePID.
func (d *Daemon) resolveFirecrackerPID(ctx context.Context, machine *firecracker.Machine, apiSock string) int {
	mgr := d.fc()
	return mgr.ResolvePID(ctx, machine, apiSock)
}
// sendCtrlAltDel asks the guest to shut down gracefully via the
// firecracker API socket; thin pass-through to
// fcproc.Manager.SendCtrlAltDel.
func (d *Daemon) sendCtrlAltDel(ctx context.Context, vm model.VMRecord) error {
	mgr := d.fc()
	return mgr.SendCtrlAltDel(ctx, vm.Runtime.APISockPath)
}
// waitForExit blocks until the firecracker process with the given PID
// (bound to apiSock) terminates, or the timeout elapses; thin
// pass-through to fcproc.Manager.WaitForExit.
func (d *Daemon) waitForExit(ctx context.Context, pid int, apiSock string, timeout time.Duration) error {
	mgr := d.fc()
	return mgr.WaitForExit(ctx, pid, apiSock, timeout)
}
func (d *Daemon) killVMProcess(ctx context.Context, pid int) error {
return d.fc().Kill(ctx, pid)
// vmDNSRecordName maps a VM name to its DNS record name: the trimmed,
// lower-cased name with a ".vm" suffix. Local stand-in equivalent to
// vmdns.RecordName, so callers don't import vmdns for this one helper.
func vmDNSRecordName(name string) string {
	trimmed := strings.TrimSpace(name)
	return strings.ToLower(trimmed) + ".vm"
}
// cleanupRuntime tears down the host-side state for a VM: firecracker
// process, DM snapshot, capabilities, tap, sockets. Stays on *Daemon
// for now because it reaches into handles (VMService-owned) and
// capabilities (still on Daemon). Phase 4 will move it to VMService.
func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserveDisks bool) error {
if d.logger != nil {
d.logger.Debug("cleanup runtime", append(vmLogAttrs(vm), "preserve_disks", preserveDisks)...)
@ -88,17 +69,17 @@ func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserve
h := d.vmHandles(vm.ID)
cleanupPID := h.PID
if vm.Runtime.APISockPath != "" {
if pid, err := d.findFirecrackerPID(ctx, vm.Runtime.APISockPath); err == nil && pid > 0 {
if pid, err := d.hostNet().findFirecrackerPID(ctx, vm.Runtime.APISockPath); err == nil && pid > 0 {
cleanupPID = pid
}
}
if cleanupPID > 0 && system.ProcessRunning(cleanupPID, vm.Runtime.APISockPath) {
_ = d.killVMProcess(ctx, cleanupPID)
if err := d.waitForExit(ctx, cleanupPID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
_ = d.hostNet().killVMProcess(ctx, cleanupPID)
if err := d.hostNet().waitForExit(ctx, cleanupPID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
return err
}
}
snapshotErr := d.cleanupDMSnapshot(ctx, dmSnapshotHandles{
snapshotErr := d.hostNet().cleanupDMSnapshot(ctx, dmSnapshotHandles{
BaseLoop: h.BaseLoop,
COWLoop: h.COWLoop,
DMName: h.DMName,
@ -107,7 +88,7 @@ func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserve
featureErr := d.cleanupCapabilityState(ctx, vm)
var tapErr error
if h.TapDevice != "" {
tapErr = d.releaseTap(ctx, h.TapDevice)
tapErr = d.hostNet().releaseTap(ctx, h.TapDevice)
}
if vm.Runtime.APISockPath != "" {
_ = os.Remove(vm.Runtime.APISockPath)
@ -125,92 +106,6 @@ func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserve
return errors.Join(snapshotErr, featureErr, tapErr)
}
// defaultVSockPath returns the per-VM firecracker vsock socket path
// under runtimeDir, keyed by the short form of the VM ID.
func defaultVSockPath(runtimeDir, vmID string) string {
	sock := "fc-" + system.ShortID(vmID) + ".vsock"
	return filepath.Join(runtimeDir, sock)
}
// defaultVSockCID derives the guest vsock context ID from the VM's
// IPv4 address: 10000 plus the last octet. Returns an error when the
// address does not parse as IPv4.
func defaultVSockCID(guestIP string) (uint32, error) {
	parsed := net.ParseIP(strings.TrimSpace(guestIP))
	if v4 := parsed.To4(); v4 != nil {
		return 10000 + uint32(v4[3]), nil
	}
	return 0, fmt.Errorf("guest IP is not IPv4: %q", guestIP)
}
// waitForGuestVSockAgent blocks until the guest's vsock agent answers a
// health probe on socketPath, or until timeout elapses. Each individual
// probe gets its own 3-second budget; probes repeat every
// vsockReadyPoll. The last probe failure is wrapped into the timeout
// error so callers can see why the agent never came up.
func waitForGuestVSockAgent(ctx context.Context, logger *slog.Logger, socketPath string, timeout time.Duration) error {
	if strings.TrimSpace(socketPath) == "" {
		return errors.New("vsock path is required")
	}

	waitCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	tick := time.NewTicker(vsockReadyPoll)
	defer tick.Stop()

	var lastErr error
	for {
		probeCtx, probeCancel := context.WithTimeout(waitCtx, 3*time.Second)
		probeErr := vsockagent.Health(probeCtx, logger, socketPath)
		probeCancel()
		if probeErr == nil {
			return nil
		}
		lastErr = probeErr

		select {
		case <-waitCtx.Done():
			// Defensive: lastErr should always be set by the probe above.
			if lastErr != nil {
				return fmt.Errorf("guest vsock agent not ready: %w", lastErr)
			}
			return errors.New("guest vsock agent not ready before timeout")
		case <-tick.C:
			// Next probe.
		}
	}
}
// setDNS publishes (or refreshes) the DNS record mapping vmName to
// guestIP, then re-asserts host resolver routing toward the VM DNS
// server. A nil vmDNS server means DNS is disabled; that is a no-op,
// not an error.
func (d *Daemon) setDNS(ctx context.Context, vmName, guestIP string) error {
	if d.vmDNS == nil {
		return nil
	}
	setErr := d.vmDNS.Set(vmdns.RecordName(vmName), guestIP)
	if setErr != nil {
		return setErr
	}
	d.ensureVMDNSResolverRouting(ctx)
	return nil
}
// removeDNS drops a single DNS record by name. Both an empty record
// name and a disabled (nil) DNS server are quietly treated as no-ops.
func (d *Daemon) removeDNS(ctx context.Context, dnsName string) error {
	if dnsName == "" || d.vmDNS == nil {
		return nil
	}
	return d.vmDNS.Remove(dnsName)
}
// rebuildDNS re-derives the complete DNS record map from the VM store —
// one record per live VM that has a guest IP — and atomically swaps it
// into the DNS server. A nil vmDNS server disables DNS; no-op then.
func (d *Daemon) rebuildDNS(ctx context.Context) error {
	if d.vmDNS == nil {
		return nil
	}
	vms, err := d.store.ListVMs(ctx)
	if err != nil {
		return err
	}
	records := make(map[string]string, len(vms))
	for _, rec := range vms {
		if !d.vmAlive(rec) || strings.TrimSpace(rec.Runtime.GuestIP) == "" {
			continue
		}
		records[vmdns.RecordName(rec.Name)] = rec.Runtime.GuestIP
	}
	return d.vmDNS.Replace(records)
}
func (d *Daemon) generateName(ctx context.Context) (string, error) {
_ = ctx
if name := strings.TrimSpace(namegen.Generate()); name != "" {