banger/internal/daemon/vm_service.go
Thales Maciel 59e48e830b
daemon: split owner daemon from root helper
Move the supported systemd path to two services: an owner-user bangerd for
orchestration and a narrow root helper for bridge/tap, NAT/resolver, dm/loop,
and Firecracker ownership. This removes repeated sudo from daily vm and image
flows without leaving the general daemon running as root.

Add install metadata, system install/status/restart/uninstall commands, and a
system-owned runtime layout. Keep user SSH/config material in the owner home,
lock file_sync to the owner home, and move daemon known_hosts handling out of
the old root-owned control path.

Route privileged lifecycle steps through typed privilegedOps calls, harden the
two systemd units, and rewrite smoke plus docs around the supported service
model.

Verified with make build, make test, make lint, and make smoke on the
supported systemd host path.
2026-04-26 12:43:17 -03:00

239 lines
8.4 KiB
Go

package daemon
import (
"context"
"database/sql"
"errors"
"fmt"
"log/slog"
"strings"
"sync"
"banger/internal/daemon/opstate"
"banger/internal/firecracker"
"banger/internal/guestconfig"
"banger/internal/model"
"banger/internal/paths"
"banger/internal/store"
"banger/internal/system"
)
// VMService owns VM lifecycle — create / start / stop / restart /
// kill / delete / set — plus the handle cache, create-operation
// registry, stats polling, disk provisioning, ports query, and the
// SSH-client test seams.
//
// It holds pointers to its peer services (HostNetwork, ImageService,
// WorkspaceService) because VM lifecycle really does orchestrate
// across them (start needs bridge + tap + firecracker + auth sync +
// boot). Defining narrow function-typed interfaces for every peer
// method VMService calls would balloon the diff for no real win —
// services remain unexported within the package so nothing outside
// the daemon can see them.
//
// Capability dispatch goes through the capHooks seam rather than a
// *Daemon pointer, so VMService has no path back to the composition
// root. Daemon.buildCapabilityHooks() populates the seam at wiring
// time with the registered-capabilities loops from capabilities.go.
//
// Instances are built by newVMService from a vmServiceDeps bundle;
// every field below except vmLocks, createVMMu, createOps and handles
// is copied from that bundle.
type VMService struct {
// runner is the injected system.CommandRunner.
// NOTE(review): presumably used to run host commands — confirm
// against the lifecycle code, which is outside this section.
runner system.CommandRunner
// logger is the injected structured logger.
logger *slog.Logger
// config is the daemon configuration captured at construction.
config model.DaemonConfig
// layout is the injected paths.Layout.
layout paths.Layout
// store is the VM record store. FindVM, TouchVM and the lock
// helpers read and write records through it (GetVM, GetVMByID,
// ListVMs, UpsertVM).
store *store.Store
// vmLocks is the per-VM mutex set. Held across entire lifecycle
// ops (start, stop, delete, set) — not just the validation window.
// Workspace.prepare intentionally splits off onto its own lock
// scope; see WorkspaceService.
vmLocks vmLockSet
// createVMMu is not touched in this section of the file.
// NOTE(review): presumably serialises part of VM creation — confirm
// against the create path.
createVMMu sync.Mutex
// createOps is the create-operation registry mentioned in the type
// comment; it is keyed/typed on *vmCreateOperationState.
createOps opstate.Registry[*vmCreateOperationState]
// handles caches per-VM transient kernel/process state (PID, tap,
// loop devices, DM name/device). Rebuildable at daemon startup
// from a per-VM handles.json scratch file plus OS inspection.
handles *handleCache
// Peer services. VMService orchestrates across all three during
// start/stop/delete; pointer fields keep call sites direct without
// promoting the peer API to package-level interfaces.
net *HostNetwork
img *ImageService
ws *WorkspaceService
// priv is the injected privilegedOps seam; it is not one of the
// three peer services above.
// NOTE(review): presumably the entry point for privileged host
// steps — confirm against the privilegedOps definition.
priv privilegedOps
// vsockHostDevice is the path preflight + doctor expect to find for
// the vhost-vsock device. Defaults to defaultVsockHostDevice; tests
// point at a tempfile so RequireFile passes without needing the
// real kernel module loaded.
vsockHostDevice string
// Capability hook dispatch. VMService invokes capabilities via
// these seams, populated by Daemon.buildCapabilityHooks() at
// wiring time. Capability implementations themselves are
// structs with explicit service-pointer fields (see capabilities.go);
// VMService never reaches back to *Daemon.
capHooks capabilityHooks
// beginOperation is an injected factory that returns an
// *operationLog for a named operation plus optional attributes.
beginOperation func(name string, attrs ...any) *operationLog
}
// capabilityHooks bundles the capability-dispatch entry points that
// VMService needs. Populated by Daemon.buildCapabilityHooks() at
// service construction; stubbable in tests that don't care about
// capability side effects.
//
// Each field is a plain function value, so a zero capabilityHooks
// holds nil hooks.
// NOTE(review): whether call sites nil-check before invoking is not
// visible here — confirm before leaving a hook unset.
type capabilityHooks struct {
// addStartPrereqs is wired to Daemon.addCapabilityStartPrereqs; it
// receives the preflight check set plus the VM and image records.
addStartPrereqs func(ctx context.Context, checks *system.Preflight, vm model.VMRecord, image model.Image)
// contributeGuest is wired to Daemon.contributeGuestConfig; it
// receives the guest-config builder by pointer.
contributeGuest func(builder *guestconfig.Builder, vm model.VMRecord, image model.Image)
// contributeMachine is wired to Daemon.contributeMachineConfig; it
// receives the firecracker machine config by pointer.
contributeMachine func(cfg *firecracker.MachineConfig, vm model.VMRecord, image model.Image)
// prepareHosts is wired to Daemon.prepareCapabilityHosts. It is the
// only hook that takes the VM record by pointer.
prepareHosts func(ctx context.Context, vm *model.VMRecord, image model.Image) error
// postStart is wired to Daemon.postStartCapabilities.
postStart func(ctx context.Context, vm model.VMRecord, image model.Image) error
// cleanupState is wired to Daemon.cleanupCapabilityState.
cleanupState func(ctx context.Context, vm model.VMRecord) error
// applyConfigChanges is wired to Daemon.applyCapabilityConfigChanges
// and receives the record before and after a change.
applyConfigChanges func(ctx context.Context, before, after model.VMRecord) error
}
// vmServiceDeps is the dependency bundle consumed by newVMService.
// Every field is copied onto the VMService field of the same name.
// The one exception is vsockHostDevice: when it is empty,
// newVMService substitutes defaultVsockHostDevice.
type vmServiceDeps struct {
runner system.CommandRunner
logger *slog.Logger
config model.DaemonConfig
layout paths.Layout
store *store.Store
// Peer services VMService orchestrates across.
net *HostNetwork
img *ImageService
ws *WorkspaceService
priv privilegedOps
// capHooks is normally produced by Daemon.buildCapabilityHooks.
capHooks capabilityHooks
beginOperation func(name string, attrs ...any) *operationLog
// vsockHostDevice may be left empty to get the package default.
vsockHostDevice string
}
// newVMService assembles a VMService from deps. Every dependency is
// copied across as-is, a fresh handle cache is attached, and an empty
// vsockHostDevice falls back to defaultVsockHostDevice.
func newVMService(deps vmServiceDeps) *VMService {
	svc := &VMService{
		runner:          deps.runner,
		logger:          deps.logger,
		config:          deps.config,
		layout:          deps.layout,
		store:           deps.store,
		net:             deps.net,
		img:             deps.img,
		ws:              deps.ws,
		priv:            deps.priv,
		capHooks:        deps.capHooks,
		beginOperation:  deps.beginOperation,
		vsockHostDevice: deps.vsockHostDevice,
		handles:         newHandleCache(),
	}
	// Callers (tests in particular) may override the device path;
	// everyone else gets the package default.
	if svc.vsockHostDevice == "" {
		svc.vsockHostDevice = defaultVsockHostDevice
	}
	return svc
}
// buildCapabilityHooks returns a capabilityHooks value whose entries
// are bound to the Daemon's capability-dispatch methods. The registry
// and capability types stay on *Daemon; VMService only ever sees the
// function values handed out here.
func (d *Daemon) buildCapabilityHooks() capabilityHooks {
	var hooks capabilityHooks

	// Start-time hooks.
	hooks.addStartPrereqs = d.addCapabilityStartPrereqs
	hooks.contributeGuest = d.contributeGuestConfig
	hooks.contributeMachine = d.contributeMachineConfig
	hooks.prepareHosts = d.prepareCapabilityHosts
	hooks.postStart = d.postStartCapabilities

	// Teardown and reconfiguration hooks.
	hooks.cleanupState = d.cleanupCapabilityState
	hooks.applyConfigChanges = d.applyCapabilityConfigChanges

	return hooks
}
// FindVM resolves an ID-or-name reference to a stored VM record.
//
// Resolution order:
//  1. An empty reference is rejected outright.
//  2. An exact lookup through store.GetVM wins if it succeeds. Any
//     error from that lookup (not only "not found") falls through to
//     the next step.
//  3. Otherwise every stored VM is scanned and a record matches when
//     idOrName is a prefix of its ID or of its Name.
//
// Exactly one prefix match is returned; several matches yield a
// "multiple VMs match" error and none yields a "not found" error. A
// failure listing VMs is returned unchanged.
func (s *VMService) FindVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	if idOrName == "" {
		return model.VMRecord{}, errors.New("vm id or name is required")
	}

	exact, lookupErr := s.store.GetVM(ctx, idOrName)
	if lookupErr == nil {
		return exact, nil
	}

	all, err := s.store.ListVMs(ctx)
	if err != nil {
		return model.VMRecord{}, err
	}

	var (
		found model.VMRecord
		hits  int
	)
	for _, candidate := range all {
		if !strings.HasPrefix(candidate.ID, idOrName) && !strings.HasPrefix(candidate.Name, idOrName) {
			continue
		}
		// A record matching on both ID and Name still counts once.
		found = candidate
		hits++
	}

	switch {
	case hits == 1:
		return found, nil
	case hits > 1:
		return model.VMRecord{}, fmt.Errorf("multiple VMs match %q", idOrName)
	default:
		return model.VMRecord{}, fmt.Errorf("vm %q not found", idOrName)
	}
}
// TouchVM resolves idOrName, then, while holding that VM's lock,
// passes the record to system.TouchNow and persists it with
// store.UpsertVM. It returns the record as written, or a zero record
// plus the error if resolution or the upsert fails.
func (s *VMService) TouchVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	touch := func(rec model.VMRecord) (model.VMRecord, error) {
		system.TouchNow(&rec)
		err := s.store.UpsertVM(ctx, rec)
		if err != nil {
			return model.VMRecord{}, err
		}
		return rec, nil
	}
	return s.withVMLockByRef(ctx, idOrName, touch)
}
// withVMLockByRef is the reference-based front door to the per-VM
// lock. It resolves idOrName with FindVM (before any lock is taken)
// and then delegates to withVMLockByID using the resolved record's
// ID. A resolution failure is returned without invoking fn.
func (s *VMService) withVMLockByRef(ctx context.Context, idOrName string, fn func(model.VMRecord) (model.VMRecord, error)) (model.VMRecord, error) {
	resolved, err := s.FindVM(ctx, idOrName)
	if err == nil {
		return s.withVMLockByID(ctx, resolved.ID, fn)
	}
	return model.VMRecord{}, err
}
// withVMLockByID runs fn while holding the per-VM lock keyed on id.
// Keying on the ID rather than the name means a rename in flight
// cannot drop the lock. The record is fetched with store.GetVMByID
// only after the lock is acquired, so fn sees the committed state,
// and the lock is released when this function returns.
//
// A blank (whitespace-only) id is rejected before locking. The lock
// key and the store lookup both use id exactly as given. A
// sql.ErrNoRows lookup failure is reported as a "not found" error;
// other lookup errors are returned unchanged.
func (s *VMService) withVMLockByID(ctx context.Context, id string, fn func(model.VMRecord) (model.VMRecord, error)) (model.VMRecord, error) {
	if strings.TrimSpace(id) == "" {
		return model.VMRecord{}, errors.New("vm id is required")
	}

	release := s.lockVMID(id)
	defer release()

	current, err := s.store.GetVMByID(ctx, id)
	switch {
	case err == nil:
		return fn(current)
	case errors.Is(err, sql.ErrNoRows):
		return model.VMRecord{}, fmt.Errorf("vm %q not found", id)
	default:
		return model.VMRecord{}, err
	}
}
// withVMLockByIDErr wraps withVMLockByID for callbacks that only
// report success or failure. fn is adapted into the record-returning
// callback shape, and the record withVMLockByID hands back is
// discarded.
func (s *VMService) withVMLockByIDErr(ctx context.Context, id string, fn func(model.VMRecord) error) error {
	adapter := func(rec model.VMRecord) (model.VMRecord, error) {
		if fnErr := fn(rec); fnErr != nil {
			return model.VMRecord{}, fnErr
		}
		return rec, nil
	}
	if _, err := s.withVMLockByID(ctx, id, adapter); err != nil {
		return err
	}
	return nil
}
// lockVMID acquires the per-VM lock for id from vmLocks and returns
// the function that releases it. It exists for callers that must
// hold the lock outside withVMLockByRef/withVMLockByID (workspace
// prepare, for example). Unlike withVMLockByID it does not validate
// id or load the record.
func (s *VMService) lockVMID(id string) func() {
	release := s.vmLocks.lock(id)
	return release
}