daemon split (4/5): extract *VMService service

Phase 4 of the daemon god-struct refactor. VM lifecycle, create-op
registry, handle cache, disk provisioning, stats polling, ports
query, and the per-VM lock set all move off *Daemon onto *VMService.

Daemon keeps thin forwarders only for FindVM / TouchVM (dispatch
surface) and otherwise owns no VM lifecycle code. Lazy-init via
d.vmSvc() mirrors the earlier services so test literals like
`&Daemon{store: db, runner: r}` still get a functional service
without spelling one out.

Three small cleanups along the way:

  * preflight helpers (validateStartPrereqs / addBaseStartPrereqs
    / addBaseStartCommandPrereqs / validateWorkDiskResizePrereqs)
    move with the VM methods that call them.
  * cleanupRuntime / rebuildDNS move to *VMService, with
    HostNetwork primitives (findFirecrackerPID, cleanupDMSnapshot,
    killVMProcess, releaseTap, waitForExit, sendCtrlAltDel)
    reached through s.net instead of the hostNet() facade.
  * vsockAgentBinary becomes a package-level function so both
    *Daemon (doctor) and *VMService (preflight) call one entry
    point instead of each owning a forwarder method.

WorkspaceService's peer deps switch from eager method values to
closures — vmSvc() constructs VMService with WorkspaceService as a
peer, so resolving d.vmSvc().FindVM at construction time recursed
through workspaceSvc() → vmSvc(). Closures defer the lookup to call
time.

Pure code motion: build + unit tests green, lint clean. No RPC
surface or lock-ordering changes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Thales Maciel 2026-04-20 20:57:05 -03:00
parent c0d456e734
commit 466a7c30c4
No known key found for this signature in database
GPG key ID: 33112E6833C34679
23 changed files with 655 additions and 463 deletions

View file

@ -16,24 +16,24 @@ import (
"banger/internal/system"
)
func (d *Daemon) StartVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
return d.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
image, err := d.store.GetImageByID(ctx, vm.ImageID)
func (s *VMService) StartVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
image, err := s.store.GetImageByID(ctx, vm.ImageID)
if err != nil {
return model.VMRecord{}, err
}
if d.vmAlive(vm) {
if d.logger != nil {
d.logger.Info("vm already running", vmLogAttrs(vm)...)
if s.vmAlive(vm) {
if s.logger != nil {
s.logger.Info("vm already running", vmLogAttrs(vm)...)
}
return vm, nil
}
return d.startVMLocked(ctx, vm, image)
return s.startVMLocked(ctx, vm, image)
})
}
func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image model.Image) (_ model.VMRecord, err error) {
op := d.beginOperation("vm.start", append(vmLogAttrs(vm), imageLogAttrs(image)...)...)
func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image model.Image) (_ model.VMRecord, err error) {
op := s.beginOperation("vm.start", append(vmLogAttrs(vm), imageLogAttrs(image)...)...)
defer func() {
if err != nil {
err = annotateLogPath(err, vm.Runtime.LogPath)
@ -44,32 +44,32 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
}()
op.stage("preflight")
vmCreateStage(ctx, "preflight", "checking host prerequisites")
if err := d.validateStartPrereqs(ctx, vm, image); err != nil {
if err := s.validateStartPrereqs(ctx, vm, image); err != nil {
return model.VMRecord{}, err
}
if err := os.MkdirAll(vm.Runtime.VMDir, 0o755); err != nil {
return model.VMRecord{}, err
}
op.stage("cleanup_runtime")
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
if err := s.cleanupRuntime(ctx, vm, true); err != nil {
return model.VMRecord{}, err
}
d.clearVMHandles(vm)
s.clearVMHandles(vm)
op.stage("bridge")
if err := d.hostNet().ensureBridge(ctx); err != nil {
if err := s.net.ensureBridge(ctx); err != nil {
return model.VMRecord{}, err
}
op.stage("socket_dir")
if err := d.hostNet().ensureSocketDir(); err != nil {
if err := s.net.ensureSocketDir(); err != nil {
return model.VMRecord{}, err
}
shortID := system.ShortID(vm.ID)
apiSock := filepath.Join(d.layout.RuntimeDir, "fc-"+shortID+".sock")
apiSock := filepath.Join(s.layout.RuntimeDir, "fc-"+shortID+".sock")
dmName := "fc-rootfs-" + shortID
tapName := "tap-fc-" + shortID
if strings.TrimSpace(vm.Runtime.VSockPath) == "" {
vm.Runtime.VSockPath = defaultVSockPath(d.layout.RuntimeDir, vm.ID)
vm.Runtime.VSockPath = defaultVSockPath(s.layout.RuntimeDir, vm.ID)
}
if vm.Runtime.VSockCID == 0 {
vm.Runtime.VSockCID, err = defaultVSockCID(vm.Runtime.GuestIP)
@ -86,13 +86,13 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
op.stage("system_overlay", "overlay_path", vm.Runtime.SystemOverlay)
vmCreateStage(ctx, "prepare_rootfs", "preparing system overlay")
if err := d.ensureSystemOverlay(ctx, &vm); err != nil {
if err := s.ensureSystemOverlay(ctx, &vm); err != nil {
return model.VMRecord{}, err
}
op.stage("dm_snapshot", "dm_name", dmName)
vmCreateStage(ctx, "prepare_rootfs", "creating root filesystem snapshot")
snapHandles, err := d.hostNet().createDMSnapshot(ctx, image.RootfsPath, vm.Runtime.SystemOverlay, dmName)
snapHandles, err := s.net.createDMSnapshot(ctx, image.RootfsPath, vm.Runtime.SystemOverlay, dmName)
if err != nil {
return model.VMRecord{}, err
}
@ -107,7 +107,7 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
DMName: snapHandles.DMName,
DMDev: snapHandles.DMDev,
}
d.setVMHandles(vm, live)
s.setVMHandles(vm, live)
vm.Runtime.APISockPath = apiSock
vm.Runtime.State = model.VMStateRunning
@ -119,38 +119,38 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
vm.Runtime.State = model.VMStateError
vm.Runtime.LastError = err.Error()
op.stage("cleanup_after_failure", "error", err.Error())
if cleanupErr := d.cleanupRuntime(context.Background(), vm, true); cleanupErr != nil {
if cleanupErr := s.cleanupRuntime(context.Background(), vm, true); cleanupErr != nil {
err = errors.Join(err, cleanupErr)
}
d.clearVMHandles(vm)
_ = d.store.UpsertVM(context.Background(), vm)
s.clearVMHandles(vm)
_ = s.store.UpsertVM(context.Background(), vm)
return model.VMRecord{}, err
}
op.stage("patch_root_overlay")
vmCreateStage(ctx, "prepare_rootfs", "writing guest configuration")
if err := d.patchRootOverlay(ctx, vm, image); err != nil {
if err := s.patchRootOverlay(ctx, vm, image); err != nil {
return cleanupOnErr(err)
}
op.stage("prepare_host_features")
vmCreateStage(ctx, "prepare_host_features", "preparing host-side vm features")
if err := d.prepareCapabilityHosts(ctx, &vm, image); err != nil {
if err := s.capHooks.prepareHosts(ctx, &vm, image); err != nil {
return cleanupOnErr(err)
}
op.stage("tap")
tap, err := d.hostNet().acquireTap(ctx, tapName)
tap, err := s.net.acquireTap(ctx, tapName)
if err != nil {
return cleanupOnErr(err)
}
live.TapDevice = tap
d.setVMHandles(vm, live)
s.setVMHandles(vm, live)
op.stage("metrics_file", "metrics_path", vm.Runtime.MetricsPath)
if err := os.WriteFile(vm.Runtime.MetricsPath, nil, 0o644); err != nil {
return cleanupOnErr(err)
}
op.stage("firecracker_binary")
fcPath, err := d.hostNet().firecrackerBinary()
fcPath, err := s.net.firecrackerBinary()
if err != nil {
return cleanupOnErr(err)
}
@ -165,7 +165,7 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
// 2. init= pointing at our universal wrapper which installs
// systemd+sshd on first boot if missing.
kernelArgs = system.BuildBootArgsWithKernelIP(
vm.Name, vm.Runtime.GuestIP, d.config.BridgeIP, d.config.DefaultDNS,
vm.Name, vm.Runtime.GuestIP, s.config.BridgeIP, s.config.DefaultDNS,
) + " init=" + imagepull.FirstBootScriptPath
}
@ -189,9 +189,9 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
VSockCID: vm.Runtime.VSockCID,
VCPUCount: vm.Spec.VCPUCount,
MemoryMiB: vm.Spec.MemoryMiB,
Logger: d.logger,
Logger: s.logger,
}
d.contributeMachineConfig(&machineConfig, vm, image)
s.capHooks.contributeMachine(&machineConfig, vm, image)
machine, err := firecracker.NewMachine(ctx, machineConfig)
if err != nil {
return cleanupOnErr(err)
@ -200,48 +200,48 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
// Use a fresh context: the request ctx may already be cancelled (client
// disconnect), but we still need the PID so cleanupRuntime can kill the
// Firecracker process that was spawned before the failure.
live.PID = d.hostNet().resolveFirecrackerPID(context.Background(), machine, apiSock)
d.setVMHandles(vm, live)
live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, apiSock)
s.setVMHandles(vm, live)
return cleanupOnErr(err)
}
live.PID = d.hostNet().resolveFirecrackerPID(context.Background(), machine, apiSock)
d.setVMHandles(vm, live)
live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, apiSock)
s.setVMHandles(vm, live)
op.debugStage("firecracker_started", "pid", live.PID)
op.stage("socket_access", "api_socket", apiSock)
if err := d.hostNet().ensureSocketAccess(ctx, apiSock, "firecracker api socket"); err != nil {
if err := s.net.ensureSocketAccess(ctx, apiSock, "firecracker api socket"); err != nil {
return cleanupOnErr(err)
}
op.stage("vsock_access", "vsock_path", vm.Runtime.VSockPath, "vsock_cid", vm.Runtime.VSockCID)
if err := d.hostNet().ensureSocketAccess(ctx, vm.Runtime.VSockPath, "firecracker vsock socket"); err != nil {
if err := s.net.ensureSocketAccess(ctx, vm.Runtime.VSockPath, "firecracker vsock socket"); err != nil {
return cleanupOnErr(err)
}
vmCreateStage(ctx, "wait_vsock_agent", "waiting for guest vsock agent")
if err := d.hostNet().waitForGuestVSockAgent(ctx, vm.Runtime.VSockPath, vsockReadyWait); err != nil {
if err := s.net.waitForGuestVSockAgent(ctx, vm.Runtime.VSockPath, vsockReadyWait); err != nil {
return cleanupOnErr(err)
}
op.stage("post_start_features")
vmCreateStage(ctx, "wait_guest_ready", "waiting for guest services")
if err := d.postStartCapabilities(ctx, vm, image); err != nil {
if err := s.capHooks.postStart(ctx, vm, image); err != nil {
return cleanupOnErr(err)
}
system.TouchNow(&vm)
op.stage("persist")
vmCreateStage(ctx, "finalize", "saving vm state")
if err := d.store.UpsertVM(ctx, vm); err != nil {
if err := s.store.UpsertVM(ctx, vm); err != nil {
return cleanupOnErr(err)
}
return vm, nil
}
func (d *Daemon) StopVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
return d.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
return d.stopVMLocked(ctx, vm)
func (s *VMService) StopVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
return s.stopVMLocked(ctx, vm)
})
}
func (d *Daemon) stopVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
vm = current
op := d.beginOperation("vm.stop", "vm_ref", vm.ID)
op := s.beginOperation("vm.stop", "vm_ref", vm.ID)
defer func() {
if err != nil {
op.fail(err, vmLogAttrs(vm)...)
@ -249,54 +249,54 @@ func (d *Daemon) stopVMLocked(ctx context.Context, current model.VMRecord) (vm m
}
op.done(vmLogAttrs(vm)...)
}()
if !d.vmAlive(vm) {
if !s.vmAlive(vm) {
op.stage("cleanup_stale_runtime")
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
if err := s.cleanupRuntime(ctx, vm, true); err != nil {
return model.VMRecord{}, err
}
vm.State = model.VMStateStopped
vm.Runtime.State = model.VMStateStopped
d.clearVMHandles(vm)
if err := d.store.UpsertVM(ctx, vm); err != nil {
s.clearVMHandles(vm)
if err := s.store.UpsertVM(ctx, vm); err != nil {
return model.VMRecord{}, err
}
return vm, nil
}
pid := d.vmHandles(vm.ID).PID
pid := s.vmHandles(vm.ID).PID
op.stage("graceful_shutdown")
if err := d.hostNet().sendCtrlAltDel(ctx, vm.Runtime.APISockPath); err != nil {
if err := s.net.sendCtrlAltDel(ctx, vm.Runtime.APISockPath); err != nil {
return model.VMRecord{}, err
}
op.stage("wait_for_exit", "pid", pid)
if err := d.hostNet().waitForExit(ctx, pid, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil {
if err := s.net.waitForExit(ctx, pid, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil {
if !errors.Is(err, errWaitForExitTimeout) {
return model.VMRecord{}, err
}
op.stage("graceful_shutdown_timeout", "pid", pid)
}
op.stage("cleanup_runtime")
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
if err := s.cleanupRuntime(ctx, vm, true); err != nil {
return model.VMRecord{}, err
}
vm.State = model.VMStateStopped
vm.Runtime.State = model.VMStateStopped
d.clearVMHandles(vm)
s.clearVMHandles(vm)
system.TouchNow(&vm)
if err := d.store.UpsertVM(ctx, vm); err != nil {
if err := s.store.UpsertVM(ctx, vm); err != nil {
return model.VMRecord{}, err
}
return vm, nil
}
func (d *Daemon) KillVM(ctx context.Context, params api.VMKillParams) (model.VMRecord, error) {
return d.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
return d.killVMLocked(ctx, vm, params.Signal)
func (s *VMService) KillVM(ctx context.Context, params api.VMKillParams) (model.VMRecord, error) {
return s.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
return s.killVMLocked(ctx, vm, params.Signal)
})
}
func (d *Daemon) killVMLocked(ctx context.Context, current model.VMRecord, signalValue string) (vm model.VMRecord, err error) {
func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, signalValue string) (vm model.VMRecord, err error) {
vm = current
op := d.beginOperation("vm.kill", "vm_ref", vm.ID, "signal", signalValue)
op := s.beginOperation("vm.kill", "vm_ref", vm.ID, "signal", signalValue)
defer func() {
if err != nil {
op.fail(err, vmLogAttrs(vm)...)
@ -304,15 +304,15 @@ func (d *Daemon) killVMLocked(ctx context.Context, current model.VMRecord, signa
}
op.done(vmLogAttrs(vm)...)
}()
if !d.vmAlive(vm) {
if !s.vmAlive(vm) {
op.stage("cleanup_stale_runtime")
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
if err := s.cleanupRuntime(ctx, vm, true); err != nil {
return model.VMRecord{}, err
}
vm.State = model.VMStateStopped
vm.Runtime.State = model.VMStateStopped
d.clearVMHandles(vm)
if err := d.store.UpsertVM(ctx, vm); err != nil {
s.clearVMHandles(vm)
if err := s.store.UpsertVM(ctx, vm); err != nil {
return model.VMRecord{}, err
}
return vm, nil
@ -322,34 +322,34 @@ func (d *Daemon) killVMLocked(ctx context.Context, current model.VMRecord, signa
if signal == "" {
signal = "TERM"
}
pid := d.vmHandles(vm.ID).PID
pid := s.vmHandles(vm.ID).PID
op.stage("send_signal", "pid", pid, "signal", signal)
if _, err := d.runner.RunSudo(ctx, "kill", "-"+signal, strconv.Itoa(pid)); err != nil {
if _, err := s.runner.RunSudo(ctx, "kill", "-"+signal, strconv.Itoa(pid)); err != nil {
return model.VMRecord{}, err
}
op.stage("wait_for_exit", "pid", pid)
if err := d.hostNet().waitForExit(ctx, pid, vm.Runtime.APISockPath, 30*time.Second); err != nil {
if err := s.net.waitForExit(ctx, pid, vm.Runtime.APISockPath, 30*time.Second); err != nil {
if !errors.Is(err, errWaitForExitTimeout) {
return model.VMRecord{}, err
}
op.stage("signal_timeout", "pid", pid, "signal", signal)
}
op.stage("cleanup_runtime")
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
if err := s.cleanupRuntime(ctx, vm, true); err != nil {
return model.VMRecord{}, err
}
vm.State = model.VMStateStopped
vm.Runtime.State = model.VMStateStopped
d.clearVMHandles(vm)
s.clearVMHandles(vm)
system.TouchNow(&vm)
if err := d.store.UpsertVM(ctx, vm); err != nil {
if err := s.store.UpsertVM(ctx, vm); err != nil {
return model.VMRecord{}, err
}
return vm, nil
}
func (d *Daemon) RestartVM(ctx context.Context, idOrName string) (vm model.VMRecord, err error) {
op := d.beginOperation("vm.restart", "vm_ref", idOrName)
func (s *VMService) RestartVM(ctx context.Context, idOrName string) (vm model.VMRecord, err error) {
op := s.beginOperation("vm.restart", "vm_ref", idOrName)
defer func() {
if err != nil {
op.fail(err, vmLogAttrs(vm)...)
@ -357,34 +357,34 @@ func (d *Daemon) RestartVM(ctx context.Context, idOrName string) (vm model.VMRec
}
op.done(vmLogAttrs(vm)...)
}()
resolved, err := d.FindVM(ctx, idOrName)
resolved, err := s.FindVM(ctx, idOrName)
if err != nil {
return model.VMRecord{}, err
}
return d.withVMLockByID(ctx, resolved.ID, func(vm model.VMRecord) (model.VMRecord, error) {
return s.withVMLockByID(ctx, resolved.ID, func(vm model.VMRecord) (model.VMRecord, error) {
op.stage("stop")
vm, err = d.stopVMLocked(ctx, vm)
vm, err = s.stopVMLocked(ctx, vm)
if err != nil {
return model.VMRecord{}, err
}
image, err := d.store.GetImageByID(ctx, vm.ImageID)
image, err := s.store.GetImageByID(ctx, vm.ImageID)
if err != nil {
return model.VMRecord{}, err
}
op.stage("start", vmLogAttrs(vm)...)
return d.startVMLocked(ctx, vm, image)
return s.startVMLocked(ctx, vm, image)
})
}
func (d *Daemon) DeleteVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
return d.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
return d.deleteVMLocked(ctx, vm)
func (s *VMService) DeleteVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
return s.deleteVMLocked(ctx, vm)
})
}
func (d *Daemon) deleteVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
func (s *VMService) deleteVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
vm = current
op := d.beginOperation("vm.delete", "vm_ref", vm.ID)
op := s.beginOperation("vm.delete", "vm_ref", vm.ID)
defer func() {
if err != nil {
op.fail(err, vmLogAttrs(vm)...)
@ -392,17 +392,17 @@ func (d *Daemon) deleteVMLocked(ctx context.Context, current model.VMRecord) (vm
}
op.done(vmLogAttrs(vm)...)
}()
if d.vmAlive(vm) {
pid := d.vmHandles(vm.ID).PID
if s.vmAlive(vm) {
pid := s.vmHandles(vm.ID).PID
op.stage("kill_running_vm", "pid", pid)
_ = d.hostNet().killVMProcess(ctx, pid)
_ = s.net.killVMProcess(ctx, pid)
}
op.stage("cleanup_runtime")
if err := d.cleanupRuntime(ctx, vm, false); err != nil {
if err := s.cleanupRuntime(ctx, vm, false); err != nil {
return model.VMRecord{}, err
}
op.stage("delete_store_record")
if err := d.store.DeleteVM(ctx, vm.ID); err != nil {
if err := s.store.DeleteVM(ctx, vm.ID); err != nil {
return model.VMRecord{}, err
}
if vm.Runtime.VMDir != "" {
@ -414,6 +414,6 @@ func (d *Daemon) deleteVMLocked(ctx context.Context, current model.VMRecord) (vm
// Drop any host-key pins. A future VM reusing this IP or name
// would otherwise trip the TOFU mismatch branch in
// TOFUHostKeyCallback and fail to connect.
removeVMKnownHosts(d.layout.KnownHostsPath, vm, d.logger)
removeVMKnownHosts(s.layout.KnownHostsPath, vm, s.logger)
return vm, nil
}