Commit 3 of the god-service decomposition. VMService still
owned 45+ methods after the startVMLocked extraction and RPC table
landed in commits 1 and 2. Stats / ports / health / vsock-ping sit
in a corner of that surface that doesn't share any state with
lifecycle orchestration — nothing about "what's this VM's CPU
doing" belongs in the same service as Create/Start/Stop/Delete/Set.
New StatsService owns:
- GetVMStats / getVMStatsLocked / collectStats (stats collection)
- HealthVM / PingVM (vsock-agent health probe)
- PortsVM + buildVMPorts + probeWebListener + probeHTTPScheme +
dedupeVMPorts (listening-port enumeration)
- pollStats (background ticker refresh)
- stopStaleVMs (auto-stop sweep past config.AutoStopStaleAfter)
The five VMService touch-points stats genuinely needs — vmAlive,
vmHandles, the two per-VM lock helpers, and cleanupRuntime for the
stale-sweep tear-down — come in as function-typed closures, not a
*VMService pointer. StatsService has no back-reference to its
sibling. This mirrors the dependency-struct pattern WorkspaceService
already uses.
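
A sketch of the deps struct that wireServices (further down) fills in
for StatsService. The field names come straight from that wiring; the
types are inferred here, and the real definition lives in
stats_service.go:

    type statsServiceDeps struct {
        runner         system.CommandRunner
        logger         *slog.Logger
        config         model.DaemonConfig
        store          *store.Store
        net            *HostNetwork
        beginOperation func(name string, attrs ...any) *operationLog

        // VMService touch-points, injected as closures so StatsService
        // never holds a *VMService.
        vmAlive           func(model.VMRecord) bool
        vmHandles         func(id string) model.VMHandles
        withVMLockByRef   func(context.Context, string, func(model.VMRecord) (model.VMRecord, error)) (model.VMRecord, error)
        withVMLockByIDErr func(context.Context, string, func(model.VMRecord) error) error
        cleanupRuntime    func(context.Context, model.VMRecord, bool) error
    }
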
Wiring: d.stats is populated in wireServices AFTER d.vm (closures
must see a non-nil d.vm at call time). Dispatch table's four
entries (vm.stats / vm.health / vm.ping / vm.ports) now resolve
through d.stats. Background loop's pollStats / stopStaleVMs
tickers do the same. Dispatch surface from the RPC client's
perspective is byte-identical.
After this commit:
- vm_stats.go and ports.go are deleted; their content (plus the
stats-specific fields) lives in stats_service.go.
- VMService loses 12 methods. It's still the biggest service
(~30 methods, all lifecycle-supporting: handle cache, disk
provisioning, preflight, create-ops registry, lock helpers,
the lifecycle verbs themselves) but it's finally one coherent
concern instead of five.
Tests:
- TestWireServicesInstantiatesStatsService — pins that the
  wiring order leaves d.stats non-nil with all five of its
  closures populated. Prevents a silent background-loop regression.
- All existing tests that called d.vm.HealthVM / d.vm.PingVM /
d.vm.PortsVM / d.vm.collectStats were re-pointed at d.stats.
Smoke: all 21 scenarios green, including vm ports (exercises the
new PortsVM entry end-to-end) and the long-running workspace
scenarios (exercise the background stats poller implicitly).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
477 lines · 14 KiB · Go
package daemon

import (
	"bufio"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"net"
	"os"
	"sync"
	"time"

	"banger/internal/config"
	ws "banger/internal/daemon/workspace"
	"banger/internal/model"
	"banger/internal/paths"
	"banger/internal/rpc"
	"banger/internal/store"
	"banger/internal/system"
	"banger/internal/vmdns"
)

// Daemon is the composition root: shared infrastructure (store,
// runner, logger, layout, config) plus pointers to the five focused
// services that own behavior. Open wires the services; the dispatch
// loop forwards RPCs to them. No lifecycle / image / workspace /
// networking / stats behavior lives on *Daemon itself — it's wiring.
type Daemon struct {
	layout paths.Layout
	config model.DaemonConfig
	store  *store.Store
	runner system.CommandRunner
	logger *slog.Logger

	net   *HostNetwork
	img   *ImageService
	ws    *WorkspaceService
	vm    *VMService
	stats *StatsService

	closing         chan struct{}
	once            sync.Once
	pid             int
	listener        net.Listener
	vmCaps          []vmCapability
	requestHandler  func(context.Context, rpc.Request) rpc.Response
	guestWaitForSSH func(context.Context, string, string, time.Duration) error
	guestDial       func(context.Context, string, string) (guestSSHClient, error)
}

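// Open resolves the on-disk layout, loads the daemon config, opens
// the store, and wires the services, then starts VM DNS, reconciles
// persisted VM state, and warms the tap pool. Any failure after
// wiring unwinds through the deferred Close below.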
func Open(ctx context.Context) (d *Daemon, err error) {
	layout, err := paths.Resolve()
	if err != nil {
		return nil, err
	}
	if err := paths.Ensure(layout); err != nil {
		return nil, err
	}
	cfg, err := config.Load(layout)
	if err != nil {
		return nil, err
	}
	logger, normalizedLevel, err := newDaemonLogger(os.Stderr, cfg.LogLevel)
	if err != nil {
		return nil, err
	}
	cfg.LogLevel = normalizedLevel
	db, err := store.Open(layout.DBPath)
	if err != nil {
		return nil, err
	}
	closing := make(chan struct{})
	runner := system.NewRunner()
	d = &Daemon{
		layout:  layout,
		config:  cfg,
		store:   db,
		runner:  runner,
		logger:  logger,
		closing: closing,
		pid:     os.Getpid(),
	}
	wireServices(d)
	// From here on, every failure path must run Close() so the host
	// state we touched (DNS listener goroutine, resolvectl routing,
	// SQLite handle, future side effects) gets unwound. Close is
	// idempotent + nil-guarded so it's safe to call on a partially
	// initialised daemon — `d.store == nil` and friends short-circuit
	// the teardown of components we never set up.
	defer func() {
		if err != nil {
			_ = d.Close()
		}
	}()

	d.ensureVMSSHClientConfig()
	d.logger.Info("daemon opened", "socket", layout.SocketPath, "state_dir", layout.StateDir, "log_level", cfg.LogLevel)
	if err = d.net.startVMDNS(vmdns.DefaultListenAddr); err != nil {
		d.logger.Error("daemon open failed", "stage", "start_vm_dns", "error", err.Error())
		return nil, err
	}
	if err = d.reconcile(ctx); err != nil {
		d.logger.Error("daemon open failed", "stage", "reconcile", "error", err.Error())
		return nil, err
	}
	d.net.ensureVMDNSResolverRouting(ctx)
	// Seed HostNetwork's pool index from taps already claimed by VMs
	// on disk so newly warmed pool entries don't collide with them.
	if d.config.TapPoolSize > 0 && d.store != nil {
		vms, listErr := d.store.ListVMs(ctx)
		if listErr != nil {
			d.logger.Error("daemon open failed", "stage", "initialize_tap_pool", "error", listErr.Error())
			return nil, listErr
		}
		used := make([]string, 0, len(vms))
		for _, vm := range vms {
			if tap := d.vm.vmHandles(vm.ID).TapDevice; tap != "" {
				used = append(used, tap)
			}
		}
		d.net.initializeTapPool(used)
	}
	go d.net.ensureTapPool(context.Background())
	return d, nil
}

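// Close shuts the daemon down exactly once: it signals d.closing,
// closes the listener and the store, clears VM DNS resolver routing,
// and stops the VM DNS server. Safe on a partially initialised daemon.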
func (d *Daemon) Close() error {
	var err error
	d.once.Do(func() {
		if d.logger != nil {
			d.logger.Info("daemon closing")
		}
		close(d.closing)
		if d.listener != nil {
			_ = d.listener.Close()
		}
		var closeErr error
		if d.store != nil {
			closeErr = d.store.Close()
		}
		err = errors.Join(d.net.clearVMDNSResolverRouting(context.Background()), d.net.stopVMDNS(), closeErr)
	})
	return err
}

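// Serve binds the unix socket, starts the background loop, and
// accepts connections until the context is cancelled or the daemon
// closes. Each connection is handled on its own goroutine.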
func (d *Daemon) Serve(ctx context.Context) error {
	_ = os.Remove(d.layout.SocketPath)
	listener, err := net.Listen("unix", d.layout.SocketPath)
	if err != nil {
		if d.logger != nil {
			d.logger.Error("daemon listen failed", "socket", d.layout.SocketPath, "error", err.Error())
		}
		return err
	}
	d.listener = listener
	defer listener.Close()
	defer os.Remove(d.layout.SocketPath)
	if err := os.Chmod(d.layout.SocketPath, 0o600); err != nil {
		return err
	}
	if d.logger != nil {
		d.logger.Info("daemon serving", "socket", d.layout.SocketPath, "pid", d.pid)
	}
	go d.backgroundLoop()

	for {
		conn, err := listener.Accept()
		if err != nil {
			select {
			case <-ctx.Done():
				return nil
			case <-d.closing:
				return nil
			default:
			}
			if _, ok := err.(net.Error); ok {
				if d.logger != nil {
					d.logger.Warn("daemon accept temporary failure", "error", err.Error())
				}
				time.Sleep(100 * time.Millisecond)
				continue
			}
			if d.logger != nil {
				d.logger.Error("daemon accept failed", "error", err.Error())
			}
			return err
		}
		go d.handleConn(conn)
	}
}

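// handleConn decodes a single request from the connection, dispatches
// it, and encodes the response unless the client disconnected while
// the request was in flight.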
func (d *Daemon) handleConn(conn net.Conn) {
	defer conn.Close()
	reader := bufio.NewReader(conn)
	var req rpc.Request
	if err := json.NewDecoder(reader).Decode(&req); err != nil {
		if d.logger != nil {
			d.logger.Warn("daemon request decode failed", "remote", conn.RemoteAddr().String(), "error", err.Error())
		}
		_ = json.NewEncoder(conn).Encode(rpc.NewError("bad_request", err.Error()))
		return
	}
	reqCtx, cancel := context.WithCancel(context.Background())
	defer cancel()
	stopWatch := d.watchRequestDisconnect(conn, reader, req.Method, cancel)
	defer stopWatch()
	resp := d.dispatch(reqCtx, req)
	if reqCtx.Err() != nil {
		return
	}
	if err := json.NewEncoder(conn).Encode(resp); err != nil && d.logger != nil {
		d.logger.Warn("daemon response encode failed", "method", req.Method, "remote", conn.RemoteAddr().String(), "error", err.Error())
	}
}

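// watchRequestDisconnect cancels the request context when the client
// hangs up mid-request: a watcher goroutine blocks on a read from the
// connection and treats any read error seen before the returned stop
// function runs as a disconnect.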
func (d *Daemon) watchRequestDisconnect(conn net.Conn, reader *bufio.Reader, method string, cancel context.CancelFunc) func() {
	if conn == nil || reader == nil {
		return func() {}
	}
	done := make(chan struct{})
	var once sync.Once
	go func() {
		go func() {
			<-done
			if deadlineSetter, ok := conn.(interface{ SetReadDeadline(time.Time) error }); ok {
				_ = deadlineSetter.SetReadDeadline(time.Now())
			}
		}()
		var buf [1]byte
		for {
			_, err := reader.Read(buf[:])
			if err == nil {
				continue
			}
			select {
			case <-done:
				return
			default:
			}
			if d.logger != nil {
				d.logger.Info("daemon request canceled", "method", method, "remote", conn.RemoteAddr().String(), "error", err.Error())
			}
			cancel()
			return
		}
	}()
	return func() {
		once.Do(func() {
			close(done)
		})
	}
}

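// dispatch validates the RPC protocol version and routes the request
// through the handler table, unless an override is installed on
// d.requestHandler.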
func (d *Daemon) dispatch(ctx context.Context, req rpc.Request) rpc.Response {
	if req.Version != rpc.Version {
		return rpc.NewError("bad_version", fmt.Sprintf("unsupported version %d", req.Version))
	}
	if d.requestHandler != nil {
		return d.requestHandler(ctx, req)
	}
	h, ok := rpcHandlers[req.Method]
	if !ok {
		return rpc.NewError("unknown_method", req.Method)
	}
	return h(ctx, d, req)
}

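// backgroundLoop runs the periodic work: stats polling, the stale-VM
// auto-stop sweep, and pruning of VM create operations older than ten
// minutes. It exits when the daemon starts closing.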
func (d *Daemon) backgroundLoop() {
	statsTicker := time.NewTicker(d.config.StatsPollInterval)
	staleTicker := time.NewTicker(model.DefaultStaleSweepInterval)
	defer statsTicker.Stop()
	defer staleTicker.Stop()
	for {
		select {
		case <-d.closing:
			return
		case <-statsTicker.C:
			if err := d.stats.pollStats(context.Background()); err != nil && d.logger != nil {
				d.logger.Error("background stats poll failed", "error", err.Error())
			}
		case <-staleTicker.C:
			if err := d.stats.stopStaleVMs(context.Background()); err != nil && d.logger != nil {
				d.logger.Error("background stale sweep failed", "error", err.Error())
			}
			d.vm.pruneVMCreateOperations(time.Now().Add(-10 * time.Minute))
		}
	}
}

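// reconcile replays persisted VM state at startup: running VMs get
// their runtime handles rediscovered, VMs whose firecracker process is
// gone are cleaned up and marked stopped, and VM DNS records are
// rebuilt.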
func (d *Daemon) reconcile(ctx context.Context) error {
	op := d.beginOperation("daemon.reconcile")
	vms, err := d.store.ListVMs(ctx)
	if err != nil {
		return op.fail(err)
	}
	for _, vm := range vms {
		if err := d.vm.withVMLockByIDErr(ctx, vm.ID, func(vm model.VMRecord) error {
			if vm.State != model.VMStateRunning {
				// Belt-and-braces: a stopped VM should never have a
				// scratch file or a cache entry. Clean up anything
				// left by an ungraceful previous daemon crash.
				d.vm.clearVMHandles(vm)
				return nil
			}
			// Rebuild the in-memory handle cache by loading the per-VM
			// scratch file and verifying the firecracker process is
			// still alive.
			h, alive, err := d.vm.rediscoverHandles(ctx, vm)
			if err != nil && d.logger != nil {
				d.logger.Warn("rediscover handles failed", "vm_id", vm.ID, "error", err.Error())
			}
			// Either way, seed the cache with what the scratch file
			// claimed. If alive, subsequent vmAlive() calls pass; if
			// not, cleanupRuntime needs these handles to know which
			// kernel resources (DM / loops / tap) to tear down.
			d.vm.setVMHandlesInMemory(vm.ID, h)
			if alive {
				return nil
			}
			op.stage("stale_vm", vmLogAttrs(vm)...)
			_ = d.vm.cleanupRuntime(ctx, vm, true)
			vm.State = model.VMStateStopped
			vm.Runtime.State = model.VMStateStopped
			vm.Runtime.TapDevice = ""
			d.vm.clearVMHandles(vm)
			vm.UpdatedAt = model.Now()
			return d.store.UpsertVM(ctx, vm)
		}); err != nil {
			return op.fail(err, "vm_id", vm.ID)
		}
	}
	if err := d.vm.rebuildDNS(ctx); err != nil {
		return op.fail(err)
	}
	op.done()
	return nil
}

// FindVM stays on Daemon as a thin forwarder to the VM service lookup.
// Dispatch code reads the facade directly; tests that pre-date the
// service split keep compiling.
func (d *Daemon) FindVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return d.vm.FindVM(ctx, idOrName)
}

// FindImage stays on Daemon as a thin forwarder to the image service
// lookup so callers reading dispatch code see the obvious facade, and
// tests that pre-date the service split still compile.
func (d *Daemon) FindImage(ctx context.Context, idOrName string) (model.Image, error) {
	return d.img.FindImage(ctx, idOrName)
}

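// TouchVM stays on Daemon as a thin forwarder to the VM service, for
// the same reason as FindVM and FindImage above.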
func (d *Daemon) TouchVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return d.vm.TouchVM(ctx, idOrName)
}

// wireServices populates the five focused services and their peer
// references from the infrastructure already on d (runner, logger,
// config, layout, store, closing, plus the SSH-client test seams).
// Idempotent: each service is skipped if the field is already non-nil,
// so tests can preinstall stubs for the services they want to fake and
// let wireServices fill the rest. The peer-service closures on
// WorkspaceService capture d rather than a direct *VMService pointer so
// the ws↔vm construction order doesn't recurse: the closures read d.vm
// at call time, by which point it is populated.
func wireServices(d *Daemon) {
	if d.net == nil {
		d.net = newHostNetwork(hostNetworkDeps{
			runner:  d.runner,
			logger:  d.logger,
			config:  d.config,
			layout:  d.layout,
			closing: d.closing,
		})
	}
	if d.img == nil {
		d.img = newImageService(imageServiceDeps{
			runner: d.runner,
			logger: d.logger,
			config: d.config,
			layout: d.layout,
			store:  d.store,
			beginOperation: func(name string, attrs ...any) *operationLog {
				return d.beginOperation(name, attrs...)
			},
		})
	}
	if d.ws == nil {
		d.ws = newWorkspaceService(workspaceServiceDeps{
			runner:        d.runner,
			logger:        d.logger,
			config:        d.config,
			layout:        d.layout,
			store:         d.store,
			repoInspector: ws.NewInspector(),
			vmResolver: func(ctx context.Context, idOrName string) (model.VMRecord, error) {
				return d.vm.FindVM(ctx, idOrName)
			},
			aliveChecker: func(vm model.VMRecord) bool {
				return d.vm.vmAlive(vm)
			},
			waitGuestSSH: d.waitForGuestSSH,
			dialGuest:    d.dialGuest,
			imageResolver: func(ctx context.Context, idOrName string) (model.Image, error) {
				return d.FindImage(ctx, idOrName)
			},
			imageWorkSeed: func(ctx context.Context, image model.Image, fingerprint string) error {
				return d.img.refreshManagedWorkSeedFingerprint(ctx, image, fingerprint)
			},
			withVMLockByRef: func(ctx context.Context, idOrName string, fn func(model.VMRecord) (model.VMRecord, error)) (model.VMRecord, error) {
				return d.vm.withVMLockByRef(ctx, idOrName, fn)
			},
			beginOperation: d.beginOperation,
		})
	}
	if d.vm == nil {
		d.vm = newVMService(vmServiceDeps{
			runner:          d.runner,
			logger:          d.logger,
			config:          d.config,
			layout:          d.layout,
			store:           d.store,
			net:             d.net,
			img:             d.img,
			ws:              d.ws,
			capHooks:        d.buildCapabilityHooks(),
			beginOperation:  d.beginOperation,
			vsockHostDevice: defaultVsockHostDevice,
		})
	}
	if d.stats == nil {
		// Closures capture d rather than d.vm directly, so they re-read
		// d.vm at call time. Wire order (d.vm constructed above) makes
		// the closures safe, but this pattern also protects against a
		// future test that swaps d.vm after initial wire.
		d.stats = newStatsService(statsServiceDeps{
			runner:         d.runner,
			logger:         d.logger,
			config:         d.config,
			store:          d.store,
			net:            d.net,
			beginOperation: d.beginOperation,
			vmAlive:        func(vm model.VMRecord) bool { return d.vm.vmAlive(vm) },
			vmHandles:      func(id string) model.VMHandles { return d.vm.vmHandles(id) },
			withVMLockByRef: func(ctx context.Context, idOrName string, fn func(model.VMRecord) (model.VMRecord, error)) (model.VMRecord, error) {
				return d.vm.withVMLockByRef(ctx, idOrName, fn)
			},
			withVMLockByIDErr: func(ctx context.Context, id string, fn func(model.VMRecord) error) error {
				return d.vm.withVMLockByIDErr(ctx, id, fn)
			},
			cleanupRuntime: func(ctx context.Context, vm model.VMRecord, preserve bool) error {
				return d.vm.cleanupRuntime(ctx, vm, preserve)
			},
		})
	}
	if len(d.vmCaps) == 0 {
		d.vmCaps = d.defaultCapabilities()
	}
}

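// marshalResultOrError turns a (value, error) pair into an RPC
// response: failures map to operation_failed, marshalling problems to
// marshal_failed.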
func marshalResultOrError(v any, err error) rpc.Response {
	if err != nil {
		return rpc.NewError("operation_failed", err.Error())
	}
	resp, marshalErr := rpc.NewResult(v)
	if marshalErr != nil {
		return rpc.NewError("marshal_failed", marshalErr.Error())
	}
	return resp
}

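// exists reports whether path can be stat'd.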
func exists(path string) bool {
	_, err := os.Stat(path)
	return err == nil
}