Today there's no way to correlate a CLI failure with a daemon log line. operationLog records relative timing but no id, two concurrent vm.start calls log indistinguishably, and the async vmCreateOperationState.ID is user-facing yet never reaches the journal. The root helper logs plain text to stderr while bangerd logs JSON, so a merged journalctl is hard to grep across the trust-boundary split.

Mint a per-RPC op id at dispatch entry, store it on context, and include it as an "op_id" attr on every operationLog record. The id is stamped onto every error response (including the early short-circuit paths bad_version and unknown_method). rpc.Call forwards the context op id on requests so a daemon RPC and the helper RPCs it triggers all share one id. The helper now logs JSON to match bangerd, adopts the inbound id, and emits a single "helper rpc completed" / "helper rpc failed" line per call so operators can see at a glance how long each privileged op took. vmCreateOperationState.ID is now the same id dispatch generated for vm.create.begin, so one identifier links client status polls, daemon logs, and helper logs.

The wire format gains two optional fields: rpc.Request.OpID and rpc.ErrorResponse.OpID, both omitempty so older peers (and the opposite direction) ignore them. ErrorResponse.Error() now appends "(op-XXXXXX)" to its string form when set; existing callers that just print err.Error() get the id for free.

Tests cover: dispatch stamps op_id on unknown_method, bad_version, and handler-returned errors; rpc.Call exposes the typed *ErrorResponse via errors.As so the CLI can read code/op_id; ctx op_id is forwarded to the server in the request envelope.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
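A minimal sketch of what the wire-format change and the Error() suffix described above might look like, not the actual package rpc source: only the OpID fields on rpc.Request and rpc.ErrorResponse, their omitempty behaviour, and the "(op-XXXXXX)" suffix are stated by this commit. The remaining fields (Version, Method, Params, Code, Message), the JSON tag spellings, and the exact message layout are assumptions for illustration.

package rpc

import (
	"encoding/json"
	"fmt"
)

// Request is the RPC envelope. OpID is optional: omitempty keeps the
// field off the wire entirely, so a peer that has never heard of op
// ids decodes the envelope unchanged. (Field set beyond OpID is assumed.)
type Request struct {
	Version int             `json:"version"`
	Method  string          `json:"method"`
	Params  json.RawMessage `json:"params,omitempty"`
	OpID    string          `json:"op_id,omitempty"`
}

// ErrorResponse carries the failure back to the client together with
// the op id dispatch stamped on it. Code/Message names are assumed.
type ErrorResponse struct {
	Code    string `json:"code"`
	Message string `json:"message"`
	OpID    string `json:"op_id,omitempty"`
}

// Error appends the op id when set, so callers that only print
// err.Error() still surface the correlation handle.
func (e *ErrorResponse) Error() string {
	if e.OpID == "" {
		return fmt.Sprintf("%s: %s", e.Code, e.Message)
	}
	return fmt.Sprintf("%s: %s (%s)", e.Code, e.Message, e.OpID)
}

On the CLI side, the tests described above imply a caller can run errors.As(err, &e) with e declared as *rpc.ErrorResponse after rpc.Call fails, then read e.Code and e.OpID directly instead of parsing the error string.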
239 lines · 8.4 KiB · Go
package daemon

import (
	"context"
	"database/sql"
	"errors"
	"fmt"
	"log/slog"
	"strings"
	"sync"

	"banger/internal/daemon/opstate"
	"banger/internal/firecracker"
	"banger/internal/guestconfig"
	"banger/internal/model"
	"banger/internal/paths"
	"banger/internal/store"
	"banger/internal/system"
)

// VMService owns VM lifecycle — create / start / stop / restart /
// kill / delete / set — plus the handle cache, create-operation
// registry, stats polling, disk provisioning, ports query, and the
// SSH-client test seams.
//
// It holds pointers to its peer services (HostNetwork, ImageService,
// WorkspaceService) because VM lifecycle really does orchestrate
// across them (start needs bridge + tap + firecracker + auth sync +
// boot). Defining narrow function-typed interfaces for every peer
// method VMService calls would balloon the diff for no real win —
// services remain unexported within the package so nothing outside
// the daemon can see them.
//
// Capability dispatch goes through the capHooks seam rather than a
// *Daemon pointer, so VMService has no path back to the composition
// root. Daemon.buildCapabilityHooks() populates the seam at wiring
// time with the registered-capabilities loops from capabilities.go.
type VMService struct {
	runner system.CommandRunner
	logger *slog.Logger
	config model.DaemonConfig
	layout paths.Layout
	store  *store.Store

	// vmLocks is the per-VM mutex set. Held across entire lifecycle
	// ops (start, stop, delete, set) — not just the validation window.
	// Workspace.prepare intentionally splits off onto its own lock
	// scope; see WorkspaceService.
	vmLocks    vmLockSet
	createVMMu sync.Mutex
	createOps  opstate.Registry[*vmCreateOperationState]

	// handles caches per-VM transient kernel/process state (PID, tap,
	// loop devices, DM name/device). Rebuildable at daemon startup
	// from a per-VM handles.json scratch file plus OS inspection.
	handles *handleCache

	// Peer services. VMService orchestrates across all three during
	// start/stop/delete; pointer fields keep call sites direct without
	// promoting the peer API to package-level interfaces.
	net  *HostNetwork
	img  *ImageService
	ws   *WorkspaceService
	priv privilegedOps

	// vsockHostDevice is the path preflight + doctor expect to find for
	// the vhost-vsock device. Defaults to defaultVsockHostDevice; tests
	// point at a tempfile so RequireFile passes without needing the
	// real kernel module loaded.
	vsockHostDevice string

	// Capability hook dispatch. VMService invokes capabilities via
	// these seams, populated by Daemon.buildCapabilityHooks() at
	// wiring time. Capability implementations themselves are
	// structs with explicit service-pointer fields (see capabilities.go);
	// VMService never reaches back to *Daemon.
	capHooks capabilityHooks

	beginOperation func(ctx context.Context, name string, attrs ...any) *operationLog
}

// capabilityHooks bundles the capability-dispatch entry points that
// VMService needs. Populated by Daemon.buildCapabilityHooks() at
// service construction; stubbable in tests that don't care about
// capability side effects.
type capabilityHooks struct {
	addStartPrereqs    func(ctx context.Context, checks *system.Preflight, vm model.VMRecord, image model.Image)
	contributeGuest    func(builder *guestconfig.Builder, vm model.VMRecord, image model.Image)
	contributeMachine  func(cfg *firecracker.MachineConfig, vm model.VMRecord, image model.Image)
	prepareHosts       func(ctx context.Context, vm *model.VMRecord, image model.Image) error
	postStart          func(ctx context.Context, vm model.VMRecord, image model.Image) error
	cleanupState       func(ctx context.Context, vm model.VMRecord) error
	applyConfigChanges func(ctx context.Context, before, after model.VMRecord) error
}

type vmServiceDeps struct {
	runner          system.CommandRunner
	logger          *slog.Logger
	config          model.DaemonConfig
	layout          paths.Layout
	store           *store.Store
	net             *HostNetwork
	img             *ImageService
	ws              *WorkspaceService
	priv            privilegedOps
	capHooks        capabilityHooks
	beginOperation  func(ctx context.Context, name string, attrs ...any) *operationLog
	vsockHostDevice string
}

func newVMService(deps vmServiceDeps) *VMService {
	vsockPath := deps.vsockHostDevice
	if vsockPath == "" {
		vsockPath = defaultVsockHostDevice
	}
	return &VMService{
		runner:          deps.runner,
		logger:          deps.logger,
		config:          deps.config,
		layout:          deps.layout,
		store:           deps.store,
		net:             deps.net,
		img:             deps.img,
		ws:              deps.ws,
		priv:            deps.priv,
		capHooks:        deps.capHooks,
		beginOperation:  deps.beginOperation,
		vsockHostDevice: vsockPath,
		handles:         newHandleCache(),
	}
}

// buildCapabilityHooks adapts Daemon's existing capability-dispatch
// methods into the capabilityHooks bag VMService takes. Keeps the
// registry + capability types on *Daemon while letting VMService call
// into them through explicit function seams.
func (d *Daemon) buildCapabilityHooks() capabilityHooks {
	return capabilityHooks{
		addStartPrereqs:    d.addCapabilityStartPrereqs,
		contributeGuest:    d.contributeGuestConfig,
		contributeMachine:  d.contributeMachineConfig,
		prepareHosts:       d.prepareCapabilityHosts,
		postStart:          d.postStartCapabilities,
		cleanupState:       d.cleanupCapabilityState,
		applyConfigChanges: d.applyCapabilityConfigChanges,
	}
}

// FindVM resolves an ID-or-name against the store with the historical
// precedence: exact-ID / exact-name first, then unambiguous prefix
// match. Returns an error when no match is found or when a prefix
// matches more than one record.
func (s *VMService) FindVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	if idOrName == "" {
		return model.VMRecord{}, errors.New("vm id or name is required")
	}
	if vm, err := s.store.GetVM(ctx, idOrName); err == nil {
		return vm, nil
	}
	vms, err := s.store.ListVMs(ctx)
	if err != nil {
		return model.VMRecord{}, err
	}
	matchCount := 0
	var match model.VMRecord
	for _, vm := range vms {
		if strings.HasPrefix(vm.ID, idOrName) || strings.HasPrefix(vm.Name, idOrName) {
			match = vm
			matchCount++
		}
	}
	if matchCount == 1 {
		return match, nil
	}
	if matchCount > 1 {
		return model.VMRecord{}, fmt.Errorf("multiple VMs match %q", idOrName)
	}
	return model.VMRecord{}, fmt.Errorf("vm %q not found", idOrName)
}

// TouchVM bumps a VM's updated-at timestamp under the per-VM lock.
func (s *VMService) TouchVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		system.TouchNow(&vm)
		if err := s.store.UpsertVM(ctx, vm); err != nil {
			return model.VMRecord{}, err
		}
		return vm, nil
	})
}

// withVMLockByRef resolves idOrName then serialises fn under the
// per-VM lock. Every mutating VM operation funnels through here.
func (s *VMService) withVMLockByRef(ctx context.Context, idOrName string, fn func(model.VMRecord) (model.VMRecord, error)) (model.VMRecord, error) {
	vm, err := s.FindVM(ctx, idOrName)
	if err != nil {
		return model.VMRecord{}, err
	}
	return s.withVMLockByID(ctx, vm.ID, fn)
}

// withVMLockByID locks on the stable VM ID (so a rename mid-flight
// doesn't drop the lock) and re-reads the record under the lock so
// fn sees the committed state.
func (s *VMService) withVMLockByID(ctx context.Context, id string, fn func(model.VMRecord) (model.VMRecord, error)) (model.VMRecord, error) {
	if strings.TrimSpace(id) == "" {
		return model.VMRecord{}, errors.New("vm id is required")
	}
	unlock := s.lockVMID(id)
	defer unlock()

	vm, err := s.store.GetVMByID(ctx, id)
	if err != nil {
		if errors.Is(err, sql.ErrNoRows) {
			return model.VMRecord{}, fmt.Errorf("vm %q not found", id)
		}
		return model.VMRecord{}, err
	}
	return fn(vm)
}

// withVMLockByIDErr is the error-only variant of withVMLockByID for
// callers that don't need the returned record.
func (s *VMService) withVMLockByIDErr(ctx context.Context, id string, fn func(model.VMRecord) error) error {
	_, err := s.withVMLockByID(ctx, id, func(vm model.VMRecord) (model.VMRecord, error) {
		if err := fn(vm); err != nil {
			return model.VMRecord{}, err
		}
		return vm, nil
	})
	return err
}

// lockVMID exposes the per-VM mutex for callers that need to hold it
// outside the usual withVMLockByRef/withVMLockByID helpers
// (workspace prepare, for example).
func (s *VMService) lockVMID(id string) func() {
	return s.vmLocks.lock(id)
}