// banger/internal/daemon/vm_service.go

package daemon

import (
	"context"
	"database/sql"
	"errors"
	"fmt"
	"log/slog"
	"strings"
	"sync"
	"time"

	"banger/internal/daemon/opstate"
	"banger/internal/firecracker"
	"banger/internal/guestconfig"
	"banger/internal/model"
	"banger/internal/paths"
	"banger/internal/store"
	"banger/internal/system"
)

// VMService owns VM lifecycle — create / start / stop / restart /
// kill / delete / set — plus the handle cache, create-operation
// registry, stats polling, disk provisioning, ports query, and the
// SSH-client test seams.
//
// It holds pointers to its peer services (HostNetwork, ImageService,
// WorkspaceService) because VM lifecycle really does orchestrate
// across them (start needs bridge + tap + firecracker + auth sync +
// boot). Defining narrow function-typed interfaces for every peer
// method VMService calls would balloon the diff for no real win —
// services remain unexported within the package so nothing outside
// the daemon can see them.
//
// Capability dispatch goes through the capHooks seam rather than a
// *Daemon pointer, so VMService has no path back to the composition
// root. Daemon.buildCapabilityHooks() populates the seam at wiring
// time with the registered-capabilities loops from capabilities.go.
//
// Instances are built by newVMService from a vmServiceDeps bag. The
// struct contains a sync.Mutex by value (createVMMu), so a VMService
// must be shared by pointer and never copied after first use.
type VMService struct {
	// Shared infrastructure, copied as-is from vmServiceDeps by
	// newVMService.
	runner system.CommandRunner
	logger *slog.Logger
	config model.DaemonConfig
	layout paths.Layout
	store  *store.Store

	// vmLocks is the per-VM mutex set. Held across entire lifecycle
	// ops (start, stop, delete, set) — not just the validation window.
	// Workspace.prepare intentionally splits off onto its own lock
	// scope; see WorkspaceService.
	//
	// createOps is the create-operation registry. newVMService leaves
	// vmLocks, createVMMu and createOps at their zero values.
	// NOTE(review): createVMMu is not referenced in this part of the
	// file — presumably it serialises VM creation; confirm against the
	// create path.
	vmLocks    vmLockSet
	createVMMu sync.Mutex
	createOps  opstate.Registry[*vmCreateOperationState]

	// handles caches per-VM transient kernel/process state (PID, tap,
	// loop devices, DM name/device). Rebuildable at daemon startup
	// from a per-VM handles.json scratch file plus OS inspection.
	// newVMService initialises it with newHandleCache().
	handles *handleCache

	// Peer services. VMService orchestrates across all three during
	// start/stop/delete; pointer fields keep call sites direct without
	// promoting the peer API to package-level interfaces.
	net *HostNetwork
	img *ImageService
	ws  *WorkspaceService

	// Test seams. Both SSH functions are injected through
	// vmServiceDeps; newVMService stores them without a nil check or
	// a default.
	guestWaitForSSH func(context.Context, string, string, time.Duration) error
	guestDial       func(context.Context, string, string) (guestSSHClient, error)
	// vsockHostDevice is the path preflight + doctor expect to find for
	// the vhost-vsock device. Defaults to defaultVsockHostDevice; tests
	// point at a tempfile so RequireFile passes without needing the
	// real kernel module loaded.
	vsockHostDevice string

	// Capability hook dispatch. VMService invokes capabilities via
	// these seams, populated by Daemon.buildCapabilityHooks() at
	// wiring time. Capability implementations themselves are
	// structs with explicit service-pointer fields (see capabilities.go);
	// VMService never reaches back to *Daemon.
	capHooks capabilityHooks

	// beginOperation returns an *operationLog for a named operation
	// with optional attrs. Injected through vmServiceDeps.
	// NOTE(review): presumably used to open a structured log scope per
	// lifecycle op — confirm against the call sites.
	beginOperation func(name string, attrs ...any) *operationLog
}

// capabilityHooks bundles the capability-dispatch entry points that
// VMService needs. Populated by Daemon.buildCapabilityHooks() at
// service construction; stubbable in tests that don't care about
// capability side effects.
//
// Every field is a plain function value. newVMService stores the bag
// exactly as given and performs no nil checks.
//
// Shape of the hooks, as far as the signatures show:
//   - addStartPrereqs, contributeGuest and contributeMachine return
//     nothing and receive a pointer (*system.Preflight,
//     *guestconfig.Builder, *firecracker.MachineConfig) alongside the
//     VM record and image — presumably they add to that value in
//     place; confirm against capabilities.go.
//   - prepareHosts is the only hook that takes the VM record by
//     pointer; it returns an error.
//   - postStart, cleanupState and applyConfigChanges take records by
//     value and return an error. applyConfigChanges receives both the
//     before and after records.
type capabilityHooks struct {
	addStartPrereqs    func(ctx context.Context, checks *system.Preflight, vm model.VMRecord, image model.Image)
	contributeGuest    func(builder *guestconfig.Builder, vm model.VMRecord, image model.Image)
	contributeMachine  func(cfg *firecracker.MachineConfig, vm model.VMRecord, image model.Image)
	prepareHosts       func(ctx context.Context, vm *model.VMRecord, image model.Image) error
	postStart          func(ctx context.Context, vm model.VMRecord, image model.Image) error
	cleanupState       func(ctx context.Context, vm model.VMRecord) error
	applyConfigChanges func(ctx context.Context, before, after model.VMRecord) error
}

// vmServiceDeps is the dependency bag consumed by newVMService. Each
// field is copied onto the VMService field of the same name.
//
// vsockHostDevice may be left empty, in which case newVMService
// substitutes defaultVsockHostDevice. No other field is defaulted or
// validated by the constructor.
type vmServiceDeps struct {
	runner          system.CommandRunner
	logger          *slog.Logger
	config          model.DaemonConfig
	layout          paths.Layout
	store           *store.Store
	net             *HostNetwork
	img             *ImageService
	ws              *WorkspaceService
	guestWaitForSSH func(context.Context, string, string, time.Duration) error
	guestDial       func(context.Context, string, string) (guestSSHClient, error)
	capHooks        capabilityHooks
	beginOperation  func(name string, attrs ...any) *operationLog
	vsockHostDevice string
}

// newVMService wires a VMService from deps. Every dependency is
// copied across unchanged, a fresh handle cache is attached, and an
// empty vsockHostDevice is replaced by defaultVsockHostDevice.
func newVMService(deps vmServiceDeps) *VMService {
	svc := &VMService{
		runner:          deps.runner,
		logger:          deps.logger,
		config:          deps.config,
		layout:          deps.layout,
		store:           deps.store,
		net:             deps.net,
		img:             deps.img,
		ws:              deps.ws,
		guestWaitForSSH: deps.guestWaitForSSH,
		guestDial:       deps.guestDial,
		capHooks:        deps.capHooks,
		beginOperation:  deps.beginOperation,
		vsockHostDevice: deps.vsockHostDevice,
		handles:         newHandleCache(),
	}
	// Callers that don't care about the vsock device path get the
	// package default.
	if svc.vsockHostDevice == "" {
		svc.vsockHostDevice = defaultVsockHostDevice
	}
	return svc
}

// buildCapabilityHooks returns a capabilityHooks bag whose entries
// are method values bound to d. The capability registry and dispatch
// methods stay on *Daemon; VMService only ever sees these function
// seams.
func (d *Daemon) buildCapabilityHooks() capabilityHooks {
	var hooks capabilityHooks

	hooks.addStartPrereqs = d.addCapabilityStartPrereqs
	hooks.contributeGuest = d.contributeGuestConfig
	hooks.contributeMachine = d.contributeMachineConfig
	hooks.prepareHosts = d.prepareCapabilityHosts
	hooks.postStart = d.postStartCapabilities
	hooks.cleanupState = d.cleanupCapabilityState
	hooks.applyConfigChanges = d.applyCapabilityConfigChanges

	return hooks
}

// FindVM resolves an ID-or-name reference to a single VM record.
//
// Resolution order:
//  1. an exact lookup through store.GetVM; a hit is returned as-is;
//  2. otherwise a scan of every stored VM, accepting the record only
//     if it is the sole one whose ID or Name starts with idOrName.
//
// An empty idOrName, a failing ListVMs call, an ambiguous prefix and
// a prefix with no matches each produce an error.
func (s *VMService) FindVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	if idOrName == "" {
		return model.VMRecord{}, errors.New("vm id or name is required")
	}

	// Any error from the exact lookup is treated as a miss and falls
	// through to the prefix scan.
	exact, exactErr := s.store.GetVM(ctx, idOrName)
	if exactErr == nil {
		return exact, nil
	}

	all, err := s.store.ListVMs(ctx)
	if err != nil {
		return model.VMRecord{}, err
	}

	var (
		found model.VMRecord
		hits  int
	)
	for _, candidate := range all {
		if !strings.HasPrefix(candidate.ID, idOrName) && !strings.HasPrefix(candidate.Name, idOrName) {
			continue
		}
		found = candidate
		hits++
	}

	switch {
	case hits == 1:
		return found, nil
	case hits > 1:
		return model.VMRecord{}, fmt.Errorf("multiple VMs match %q", idOrName)
	default:
		return model.VMRecord{}, fmt.Errorf("vm %q not found", idOrName)
	}
}

// TouchVM resolves idOrName, then — while holding that VM's lock —
// applies system.TouchNow to the record and writes it back with
// store.UpsertVM. The stamped record is returned on success; a
// failed upsert yields the zero record and the store error.
func (s *VMService) TouchVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	touch := func(rec model.VMRecord) (model.VMRecord, error) {
		system.TouchNow(&rec)
		err := s.store.UpsertVM(ctx, rec)
		if err != nil {
			return model.VMRecord{}, err
		}
		return rec, nil
	}
	return s.withVMLockByRef(ctx, idOrName, touch)
}

// withVMLockByRef turns an ID-or-name reference into a concrete VM
// via FindVM and then hands off to withVMLockByID, so fn runs under
// the per-VM lock. A resolution failure is returned without taking
// any lock.
func (s *VMService) withVMLockByRef(ctx context.Context, idOrName string, fn func(model.VMRecord) (model.VMRecord, error)) (model.VMRecord, error) {
	resolved, findErr := s.FindVM(ctx, idOrName)
	if findErr != nil {
		return model.VMRecord{}, findErr
	}
	// Lock on the resolved ID rather than the caller's reference,
	// which may have been a name or a prefix.
	vmID := resolved.ID
	return s.withVMLockByID(ctx, vmID, fn)
}

// withVMLockByID runs fn while holding the lock keyed by the VM's
// stable ID (a rename mid-flight therefore cannot drop the lock).
// The record is loaded with store.GetVMByID only after the lock is
// taken, so fn is handed the state read under the lock.
//
// A blank id is rejected before locking. sql.ErrNoRows from the
// store is reported as a "not found" error; any other store error is
// returned unchanged. The lock is released when the function returns.
func (s *VMService) withVMLockByID(ctx context.Context, id string, fn func(model.VMRecord) (model.VMRecord, error)) (model.VMRecord, error) {
	if len(strings.TrimSpace(id)) == 0 {
		return model.VMRecord{}, errors.New("vm id is required")
	}

	release := s.lockVMID(id)
	defer release()

	current, err := s.store.GetVMByID(ctx, id)
	switch {
	case err == nil:
		return fn(current)
	case errors.Is(err, sql.ErrNoRows):
		return model.VMRecord{}, fmt.Errorf("vm %q not found", id)
	default:
		return model.VMRecord{}, err
	}
}

// withVMLockByIDErr wraps withVMLockByID for callbacks that only
// report success or failure. The record produced by the underlying
// call is discarded; only its error is returned.
func (s *VMService) withVMLockByIDErr(ctx context.Context, id string, fn func(model.VMRecord) error) error {
	// Adapt the error-only callback to the record-returning shape
	// withVMLockByID expects.
	adapter := func(rec model.VMRecord) (model.VMRecord, error) {
		fnErr := fn(rec)
		if fnErr == nil {
			return rec, nil
		}
		return model.VMRecord{}, fnErr
	}

	_, err := s.withVMLockByID(ctx, id, adapter)
	return err
}

// lockVMID acquires the lock for id from the per-VM lock set and
// returns the function that releases it. It exists for callers that
// must hold the lock outside the withVMLockByRef / withVMLockByID
// helpers (workspace prepare, for example).
func (s *VMService) lockVMID(id string) func() {
	unlock := s.vmLocks.lock(id)
	return unlock
}