daemon split (4/5): extract *VMService
Phase 4 of the daemon god-struct refactor. VM lifecycle, create-op
registry, handle cache, disk provisioning, stats polling, ports
query, and the per-VM lock set all move off *Daemon onto *VMService.
Daemon keeps thin forwarders only for FindVM / TouchVM (dispatch
surface) and is otherwise out of VM lifecycle. Lazy-init via
d.vmSvc() mirrors the earlier services so test literals like
`&Daemon{store: db, runner: r}` still get a functional service
without spelling one out.
Three small cleanups along the way:
* preflight helpers (validateStartPrereqs / addBaseStartPrereqs
/ addBaseStartCommandPrereqs / validateWorkDiskResizePrereqs)
move with the VM methods that call them.
* cleanupRuntime / rebuildDNS move to *VMService, with
HostNetwork primitives (findFirecrackerPID, cleanupDMSnapshot,
killVMProcess, releaseTap, waitForExit, sendCtrlAltDel)
reached through s.net instead of the hostNet() facade.
* vsockAgentBinary becomes a package-level function so both
*Daemon (doctor) and *VMService (preflight) call one entry
point instead of each owning a forwarder method; rough shape
sketched after this list.
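The shape of that last change, as a hedged sketch (the binDir
parameter, the binary name, and the body are placeholders — the real
function presumably resolves against paths.Layout; assumes
"path/filepath" is imported):

    // before: a *Daemon method, which would have forced a forwarder
    // method onto *VMService to reach it:
    //
    //     func (d *Daemon) vsockAgentBinary() string { ... }
    //
    // after: one package-level entry point that doctor (*Daemon) and
    // preflight (*VMService) both call directly.
    func vsockAgentBinary(binDir string) string {
        return filepath.Join(binDir, "vsock-agent") // placeholder name
    }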
WorkspaceService's peer deps switch from eager method values to
closures — vmSvc() constructs VMService with WorkspaceService as a
peer, so resolving d.vmSvc().FindVM at construction time recursed
through workspaceSvc() → vmSvc(). Closures defer the lookup to call
time.
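A minimal, self-contained toy of the cycle and the fix (toy types
standing in for Daemon / VMService / WorkspaceService; the real
dependency is the FindVM lookup shown in the diff below):

    package main

    import "fmt"

    type vmService struct {
        ws   *wsService
        find func(ref string) string
    }

    type wsService struct {
        findVM func(ref string) string
    }

    type daemon struct {
        vm *vmService
        ws *wsService
    }

    func (d *daemon) vmSvc() *vmService {
        if d.vm != nil {
            return d.vm
        }
        d.vm = &vmService{
            ws:   d.wsSvc(), // VMService takes WorkspaceService as a peer
            find: func(ref string) string { return "vm:" + ref },
        }
        return d.vm
    }

    func (d *daemon) wsSvc() *wsService {
        if d.ws != nil {
            return d.ws
        }
        d.ws = &wsService{
            // BAD: an eager method value evaluates d.vmSvc() right
            // here, before d.ws is assigned, so vmSvc() -> wsSvc()
            // -> vmSvc() recurses forever:
            //     findVM: d.vmSvc().find,
            // GOOD: the closure defers the vmSvc() lookup to call
            // time, when both services already exist.
            findVM: func(ref string) string { return d.vmSvc().find(ref) },
        }
        return d.ws
    }

    func main() {
        d := &daemon{}
        fmt.Println(d.wsSvc().findVM("web-1")) // vm:web-1
    }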
Otherwise pure code motion: build + unit tests green, lint clean.
No RPC-surface or lock-ordering changes.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
parent c0d456e734
commit 466a7c30c4
23 changed files with 655 additions and 463 deletions
internal/daemon/vm_service.go (new file, 256 lines)
@@ -0,0 +1,256 @@
package daemon

import (
    "context"
    "database/sql"
    "errors"
    "fmt"
    "log/slog"
    "strings"
    "sync"
    "time"

    "banger/internal/daemon/opstate"
    "banger/internal/firecracker"
    "banger/internal/guestconfig"
    "banger/internal/model"
    "banger/internal/paths"
    "banger/internal/store"
    "banger/internal/system"
)

// VMService owns VM lifecycle — create / start / stop / restart /
// kill / delete / set — plus the handle cache, create-operation
// registry, stats polling, disk provisioning, ports query, and the
// SSH-client test seams.
//
// It holds pointers to its peer services (HostNetwork, ImageService,
// WorkspaceService) because VM lifecycle really does orchestrate
// across them (start needs bridge + tap + firecracker + auth sync +
// boot). Defining narrow function-typed interfaces for every peer
// method VMService calls would balloon the diff for no real win —
// services remain unexported within the package so nothing outside
// the daemon can see them.
//
// Capability invocation still runs through Daemon because the hook
// interfaces take *Daemon directly. VMService calls back via the
// capHooks seam rather than holding a *Daemon pointer, to keep the
// dependency graph acyclic.
type VMService struct {
    runner system.CommandRunner
    logger *slog.Logger
    config model.DaemonConfig
    layout paths.Layout
    store  *store.Store

    // vmLocks is the per-VM mutex set. Held across entire lifecycle
    // ops (start, stop, delete, set) — not just the validation window.
    // Workspace.prepare intentionally splits off onto its own lock
    // scope; see WorkspaceService.
    vmLocks    vmLockSet
    createVMMu sync.Mutex
    createOps  opstate.Registry[*vmCreateOperationState]

    // handles caches per-VM transient kernel/process state (PID, tap,
    // loop devices, DM name/device). Rebuildable at daemon startup
    // from a per-VM handles.json scratch file plus OS inspection.
    handles *handleCache

    // Peer services. VMService orchestrates across all three during
    // start/stop/delete; pointer fields keep call sites direct without
    // promoting the peer API to package-level interfaces.
    net *HostNetwork
    img *ImageService
    ws  *WorkspaceService

    // Test seams.
    guestWaitForSSH func(context.Context, string, string, time.Duration) error
    guestDial       func(context.Context, string, string) (guestSSHClient, error)

    // Capability hook dispatch. Capabilities themselves live on
    // *Daemon (their interface takes *Daemon as receiver); VMService
    // invokes them via these seams so it doesn't need a *Daemon
    // pointer.
    capHooks capabilityHooks

    beginOperation func(name string, attrs ...any) *operationLog
}

// capabilityHooks bundles the capability-dispatch entry points that
// VMService needs. Populated by Daemon.buildCapabilityHooks() at
// service construction; stubbable in tests that don't care about
// capability side effects.
type capabilityHooks struct {
    addStartPrereqs    func(ctx context.Context, checks *system.Preflight, vm model.VMRecord, image model.Image)
    contributeGuest    func(builder *guestconfig.Builder, vm model.VMRecord, image model.Image)
    contributeMachine  func(cfg *firecracker.MachineConfig, vm model.VMRecord, image model.Image)
    prepareHosts       func(ctx context.Context, vm *model.VMRecord, image model.Image) error
    postStart          func(ctx context.Context, vm model.VMRecord, image model.Image) error
    cleanupState       func(ctx context.Context, vm model.VMRecord) error
    applyConfigChanges func(ctx context.Context, before, after model.VMRecord) error
}

type vmServiceDeps struct {
    runner          system.CommandRunner
    logger          *slog.Logger
    config          model.DaemonConfig
    layout          paths.Layout
    store           *store.Store
    net             *HostNetwork
    img             *ImageService
    ws              *WorkspaceService
    guestWaitForSSH func(context.Context, string, string, time.Duration) error
    guestDial       func(context.Context, string, string) (guestSSHClient, error)
    capHooks        capabilityHooks
    beginOperation  func(name string, attrs ...any) *operationLog
}

func newVMService(deps vmServiceDeps) *VMService {
    return &VMService{
        runner:          deps.runner,
        logger:          deps.logger,
        config:          deps.config,
        layout:          deps.layout,
        store:           deps.store,
        net:             deps.net,
        img:             deps.img,
        ws:              deps.ws,
        guestWaitForSSH: deps.guestWaitForSSH,
        guestDial:       deps.guestDial,
        capHooks:        deps.capHooks,
        beginOperation:  deps.beginOperation,
        handles:         newHandleCache(),
    }
}

// vmSvc is Daemon's lazy-init getter. Mirrors hostNet() / imageSvc() /
// workspaceSvc() so test literals like `&Daemon{store: db, runner: r}`
// still get a functional VMService without spelling one out.
func (d *Daemon) vmSvc() *VMService {
    if d.vm != nil {
        return d.vm
    }
    d.vm = newVMService(vmServiceDeps{
        runner:          d.runner,
        logger:          d.logger,
        config:          d.config,
        layout:          d.layout,
        store:           d.store,
        net:             d.hostNet(),
        img:             d.imageSvc(),
        ws:              d.workspaceSvc(),
        guestWaitForSSH: d.guestWaitForSSH,
        guestDial:       d.guestDial,
        capHooks:        d.buildCapabilityHooks(),
        beginOperation:  d.beginOperation,
    })
    return d.vm
}

// buildCapabilityHooks adapts Daemon's existing capability-dispatch
// methods into the capabilityHooks bag VMService takes. Keeps the
// registry + capability types on *Daemon while letting VMService call
// into them through explicit function seams.
func (d *Daemon) buildCapabilityHooks() capabilityHooks {
    return capabilityHooks{
        addStartPrereqs:    d.addCapabilityStartPrereqs,
        contributeGuest:    d.contributeGuestConfig,
        contributeMachine:  d.contributeMachineConfig,
        prepareHosts:       d.prepareCapabilityHosts,
        postStart:          d.postStartCapabilities,
        cleanupState:       d.cleanupCapabilityState,
        applyConfigChanges: d.applyCapabilityConfigChanges,
    }
}

// FindVM resolves an ID-or-name against the store with the historical
// precedence: exact-ID / exact-name first, then unambiguous prefix
// match. Returns an error when no match is found or when a prefix
// matches more than one record.
func (s *VMService) FindVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
    if idOrName == "" {
        return model.VMRecord{}, errors.New("vm id or name is required")
    }
    if vm, err := s.store.GetVM(ctx, idOrName); err == nil {
        return vm, nil
    }
    vms, err := s.store.ListVMs(ctx)
    if err != nil {
        return model.VMRecord{}, err
    }
    matchCount := 0
    var match model.VMRecord
    for _, vm := range vms {
        if strings.HasPrefix(vm.ID, idOrName) || strings.HasPrefix(vm.Name, idOrName) {
            match = vm
            matchCount++
        }
    }
    if matchCount == 1 {
        return match, nil
    }
    if matchCount > 1 {
        return model.VMRecord{}, fmt.Errorf("multiple VMs match %q", idOrName)
    }
    return model.VMRecord{}, fmt.Errorf("vm %q not found", idOrName)
}

// TouchVM bumps a VM's updated-at timestamp under the per-VM lock.
func (s *VMService) TouchVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
    return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
        system.TouchNow(&vm)
        if err := s.store.UpsertVM(ctx, vm); err != nil {
            return model.VMRecord{}, err
        }
        return vm, nil
    })
}

// withVMLockByRef resolves idOrName then serialises fn under the
// per-VM lock. Every mutating VM operation funnels through here.
func (s *VMService) withVMLockByRef(ctx context.Context, idOrName string, fn func(model.VMRecord) (model.VMRecord, error)) (model.VMRecord, error) {
    vm, err := s.FindVM(ctx, idOrName)
    if err != nil {
        return model.VMRecord{}, err
    }
    return s.withVMLockByID(ctx, vm.ID, fn)
}

// withVMLockByID locks on the stable VM ID (so a rename mid-flight
// doesn't drop the lock) and re-reads the record under the lock so
// fn sees the committed state.
func (s *VMService) withVMLockByID(ctx context.Context, id string, fn func(model.VMRecord) (model.VMRecord, error)) (model.VMRecord, error) {
    if strings.TrimSpace(id) == "" {
        return model.VMRecord{}, errors.New("vm id is required")
    }
    unlock := s.lockVMID(id)
    defer unlock()

    vm, err := s.store.GetVMByID(ctx, id)
    if err != nil {
        if errors.Is(err, sql.ErrNoRows) {
            return model.VMRecord{}, fmt.Errorf("vm %q not found", id)
        }
        return model.VMRecord{}, err
    }
    return fn(vm)
}

// withVMLockByIDErr is the error-only variant of withVMLockByID for
// callers that don't need the returned record.
func (s *VMService) withVMLockByIDErr(ctx context.Context, id string, fn func(model.VMRecord) error) error {
    _, err := s.withVMLockByID(ctx, id, func(vm model.VMRecord) (model.VMRecord, error) {
        if err := fn(vm); err != nil {
            return model.VMRecord{}, err
        }
        return vm, nil
    })
    return err
}

// lockVMID exposes the per-VM mutex for callers that need to hold it
// outside the usual withVMLockByRef/withVMLockByID helpers
// (workspace prepare, for example).
func (s *VMService) lockVMID(id string) func() {
    return s.vmLocks.lock(id)
}