Separates what a VM IS (durable intent + identity + deterministic
derived paths — `VMRuntime`) from what is CURRENTLY TRUE about it
(firecracker PID, tap device, loop devices, dm-snapshot target — new
`VMHandles`). The durable state lives in the SQLite `vms` row; the
transient state lives in an in-memory cache on the daemon plus a
per-VM `handles.json` scratch file inside VMDir, rebuilt at startup
from OS inspection. Nothing kernel-level rides the SQLite schema
anymore.
Why:
Persisting ephemeral process handles to SQLite forced reconcile to
treat "running with a stale PID" as a first-class case and mix it
with real state transitions. The schema described what we last
observed, not what the VM is. Every time the observation model
shifted (tap pool, DM naming, pgrep fallback), the reconcile logic
grew a new branch. Splitting lets each layer own what it's good at:
durable records describe intent, in-memory cache + scratch file
describe momentary reality.
Shape:
- `model.VMHandles` = PID, TapDevice, BaseLoop, COWLoop, DMName,
DMDev. Never in SQLite (sketched just after this list).
- `VMRuntime` keeps: State, GuestIP, APISockPath, VSockPath,
VSockCID, LogPath, MetricsPath, DNSName, VMDir, SystemOverlay,
WorkDiskPath, LastError. All durable or deterministic.
- `handleCache` on `*Daemon` — mutex-guarded map + scratch-file
plumbing (`writeHandlesFile` / `readHandlesFile` /
`rediscoverHandles`). See `internal/daemon/vm_handles.go`.
- `d.vmAlive(vm)` replaces the 20+ inline
`vm.State==Running && ProcessRunning(vm.Runtime.PID, apiSock)`
checks scattered across the daemon. Single source of truth for
liveness.
- Startup reconcile: for each running VM, load the scratch file and
pgrep the api sock, then either keep the VM (cache seeded from the
scratch file) or demote it to stopped (scratch handles are passed to
cleanupRuntime first so the DM target, loops, and tap actually get
torn down).
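For concreteness, a minimal sketch of the two pieces, assuming a plain
mutex-guarded map. Field names come from the bullets above and the
method names from call sites in the lifecycle code below; the exact
declarations in `internal/model` and `internal/daemon/vm_handles.go`
may differ:

```go
package daemon

import "sync"

// VMHandles: transient, kernel-level state of a running VM. Held in
// the daemon's in-memory cache plus the per-VM handles.json scratch
// file, never in SQLite. (Lives in internal/model in the real tree;
// inlined here so the sketch compiles on its own.)
type VMHandles struct {
	PID       int    // firecracker process
	TapDevice string // host tap interface
	BaseLoop  string // loop device over the base rootfs image
	COWLoop   string // loop device over the per-VM COW file
	DMName    string // dm-snapshot target name
	DMDev     string // /dev/mapper path handed to firecracker as root drive
}

// handleCache: the mutex-guarded map behind setVMHandles / vmHandles /
// clearVMHandles, keyed by VM ID. The zero value means "no live handles".
type handleCache struct {
	mu sync.Mutex
	m  map[string]VMHandles
}

func (c *handleCache) set(vmID string, h VMHandles) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.m == nil {
		c.m = make(map[string]VMHandles)
	}
	c.m[vmID] = h
}

func (c *handleCache) get(vmID string) VMHandles {
	c.mu.Lock()
	defer c.mu.Unlock()
	return c.m[vmID] // zero value when the VM has nothing live
}

func (c *handleCache) clear(vmID string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	delete(c.m, vmID)
}

// Liveness (sketch): roughly what vmAlive checks, durable state plus a
// live-process probe, instead of the old inline State/PID spreads.
func alive(stateRunning bool, h VMHandles, probe func(pid int) bool) bool {
	return stateRunning && h.PID != 0 && probe(h.PID)
}
```

Wired in as the `handles` field on `*Daemon`, the scratch file exists
only so a restarted daemon can rebuild this map, or tear down what the
file describes, without SQLite ever carrying kernel state.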
Verification:
- `go test ./...` green.
- Live: `banger vm run --name handles-test -- cat /etc/hostname`
starts; `handles.json` appears in VMDir with the expected PID,
tap, loops, DM.
- `kill -9 $(pgrep bangerd)` while the VM is running, re-invoke the
CLI, daemon auto-starts, reconcile recognises the VM as alive,
`banger vm ssh` still connects, `banger vm delete` cleans up.
Tests added:
- vm_handles_test.go: scratch-file roundtrip (roughly sketched after
this list), missing/corrupt file behaviour, cache concurrency, and
rediscoverHandles preferring pgrep over the scratch file while still
returning scratch contents even when the process is dead (so cleanup
can tear down kernel state).
- vm_test.go: reconcile test rewritten to exercise the new flow
(write scratch → reconcile reads it → verifies process is gone →
issues dmsetup/losetup teardown).
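A rough shape of the roundtrip case, reusing the `VMHandles` sketch
above and assuming `handles.json` is plain JSON under VMDir; the real
test drives `writeHandlesFile` / `readHandlesFile` rather than raw
JSON calls:

```go
package daemon

import (
	"encoding/json"
	"os"
	"path/filepath"
	"testing"
)

func TestHandlesScratchRoundtrip(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "handles.json")

	want := VMHandles{
		PID:       4242,
		TapDevice: "tap-fc-abc123",
		BaseLoop:  "/dev/loop10",
		COWLoop:   "/dev/loop11",
		DMName:    "fc-rootfs-abc123",
		DMDev:     "/dev/mapper/fc-rootfs-abc123",
	}

	// Write the scratch file, read it back, and compare.
	data, err := json.Marshal(want)
	if err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(path, data, 0o644); err != nil {
		t.Fatal(err)
	}
	raw, err := os.ReadFile(path)
	if err != nil {
		t.Fatal(err)
	}
	var got VMHandles
	if err := json.Unmarshal(raw, &got); err != nil {
		t.Fatal(err)
	}
	if got != want {
		t.Fatalf("roundtrip mismatch: got %+v want %+v", got, want)
	}

	// The real suite also covers missing/corrupt files; the missing-file
	// probe here is just an os.IsNotExist check on the same path.
	if err := os.Remove(path); err != nil {
		t.Fatal(err)
	}
	if _, err := os.ReadFile(path); !os.IsNotExist(err) {
		t.Fatalf("expected IsNotExist, got %v", err)
	}
}
```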
ARCHITECTURE.md updated; `handles` added to Daemon field docs.
The daemon's VM lifecycle code (start / stop / kill / restart /
delete), reproduced for reference:
package daemon

import (
	"context"
	"errors"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"banger/internal/api"
	"banger/internal/firecracker"
	"banger/internal/imagepull"
	"banger/internal/model"
	"banger/internal/system"
)

// StartVM boots the VM identified by idOrName under its per-VM lock.
// Starting a VM that is already alive is a no-op.
func (d *Daemon) StartVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return d.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		image, err := d.store.GetImageByID(ctx, vm.ImageID)
		if err != nil {
			return model.VMRecord{}, err
		}
		if d.vmAlive(vm) {
			if d.logger != nil {
				d.logger.Info("vm already running", vmLogAttrs(vm)...)
			}
			return vm, nil
		}
		return d.startVMLocked(ctx, vm, image)
	})
}

// startVMLocked runs the full start sequence; callers must hold the per-VM lock.
func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image model.Image) (_ model.VMRecord, err error) {
	op := d.beginOperation("vm.start", append(vmLogAttrs(vm), imageLogAttrs(image)...)...)
	defer func() {
		if err != nil {
			err = annotateLogPath(err, vm.Runtime.LogPath)
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	op.stage("preflight")
	vmCreateStage(ctx, "preflight", "checking host prerequisites")
	if err := d.validateStartPrereqs(ctx, vm, image); err != nil {
		return model.VMRecord{}, err
	}
	if err := os.MkdirAll(vm.Runtime.VMDir, 0o755); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("cleanup_runtime")
	if err := d.cleanupRuntime(ctx, vm, true); err != nil {
		return model.VMRecord{}, err
	}
	d.clearVMHandles(vm)
	op.stage("bridge")
	if err := d.ensureBridge(ctx); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("socket_dir")
	if err := d.ensureSocketDir(); err != nil {
		return model.VMRecord{}, err
	}

	shortID := system.ShortID(vm.ID)
	apiSock := filepath.Join(d.layout.RuntimeDir, "fc-"+shortID+".sock")
	dmName := "fc-rootfs-" + shortID
	tapName := "tap-fc-" + shortID
	if strings.TrimSpace(vm.Runtime.VSockPath) == "" {
		vm.Runtime.VSockPath = defaultVSockPath(d.layout.RuntimeDir, vm.ID)
	}
	if vm.Runtime.VSockCID == 0 {
		vm.Runtime.VSockCID, err = defaultVSockCID(vm.Runtime.GuestIP)
		if err != nil {
			return model.VMRecord{}, err
		}
	}
	if err := os.RemoveAll(apiSock); err != nil && !os.IsNotExist(err) {
		return model.VMRecord{}, err
	}
	if err := os.RemoveAll(vm.Runtime.VSockPath); err != nil && !os.IsNotExist(err) {
		return model.VMRecord{}, err
	}

	op.stage("system_overlay", "overlay_path", vm.Runtime.SystemOverlay)
	vmCreateStage(ctx, "prepare_rootfs", "preparing system overlay")
	if err := d.ensureSystemOverlay(ctx, &vm); err != nil {
		return model.VMRecord{}, err
	}

	op.stage("dm_snapshot", "dm_name", dmName)
	vmCreateStage(ctx, "prepare_rootfs", "creating root filesystem snapshot")
	snapHandles, err := d.createDMSnapshot(ctx, image.RootfsPath, vm.Runtime.SystemOverlay, dmName)
	if err != nil {
		return model.VMRecord{}, err
	}
	// Live handles are threaded through this function as a local and
	// pushed to the cache via setVMHandles once we have every piece.
	// The cache update must happen BEFORE any step that reads handles
	// back (e.g. cleanupRuntime via cleanupOnErr) — otherwise loops
	// and DM would leak on an early failure.
	live := model.VMHandles{
		BaseLoop: snapHandles.BaseLoop,
		COWLoop:  snapHandles.COWLoop,
		DMName:   snapHandles.DMName,
		DMDev:    snapHandles.DMDev,
	}
	d.setVMHandles(vm, live)

	vm.Runtime.APISockPath = apiSock
	vm.Runtime.State = model.VMStateRunning
	vm.State = model.VMStateRunning
	vm.Runtime.LastError = ""

	cleanupOnErr := func(err error) (model.VMRecord, error) {
		vm.State = model.VMStateError
		vm.Runtime.State = model.VMStateError
		vm.Runtime.LastError = err.Error()
		op.stage("cleanup_after_failure", "error", err.Error())
		if cleanupErr := d.cleanupRuntime(context.Background(), vm, true); cleanupErr != nil {
			err = errors.Join(err, cleanupErr)
		}
		d.clearVMHandles(vm)
		_ = d.store.UpsertVM(context.Background(), vm)
		return model.VMRecord{}, err
	}

	op.stage("patch_root_overlay")
	vmCreateStage(ctx, "prepare_rootfs", "writing guest configuration")
	if err := d.patchRootOverlay(ctx, vm, image); err != nil {
		return cleanupOnErr(err)
	}
	op.stage("prepare_host_features")
	vmCreateStage(ctx, "prepare_host_features", "preparing host-side vm features")
	if err := d.prepareCapabilityHosts(ctx, &vm, image); err != nil {
		return cleanupOnErr(err)
	}
	op.stage("tap")
	tap, err := d.acquireTap(ctx, tapName)
	if err != nil {
		return cleanupOnErr(err)
	}
	live.TapDevice = tap
	d.setVMHandles(vm, live)
	op.stage("metrics_file", "metrics_path", vm.Runtime.MetricsPath)
	if err := os.WriteFile(vm.Runtime.MetricsPath, nil, 0o644); err != nil {
		return cleanupOnErr(err)
	}

	op.stage("firecracker_binary")
	fcPath, err := d.firecrackerBinary()
	if err != nil {
		return cleanupOnErr(err)
	}
	op.stage("firecracker_launch", "log_path", vm.Runtime.LogPath, "metrics_path", vm.Runtime.MetricsPath)
	vmCreateStage(ctx, "boot_firecracker", "starting firecracker")
	kernelArgs := system.BuildBootArgs(vm.Name)
	if strings.TrimSpace(image.InitrdPath) == "" {
		// Direct-boot image (no initramfs) — the rootfs may be a
		// container image without /sbin/init or iproute2. Use:
		//  1. Kernel-level IP config via ip= cmdline (CONFIG_IP_PNP),
		//     so the network is up before init runs — no ip(8) needed.
		//  2. init= pointing at our universal wrapper which installs
		//     systemd+sshd on first boot if missing.
		kernelArgs = system.BuildBootArgsWithKernelIP(
			vm.Name, vm.Runtime.GuestIP, d.config.BridgeIP, d.config.DefaultDNS,
		) + " init=" + imagepull.FirstBootScriptPath
	}

	machineConfig := firecracker.MachineConfig{
		BinaryPath:      fcPath,
		VMID:            vm.ID,
		SocketPath:      apiSock,
		LogPath:         vm.Runtime.LogPath,
		MetricsPath:     vm.Runtime.MetricsPath,
		KernelImagePath: image.KernelPath,
		InitrdPath:      image.InitrdPath,
		KernelArgs:      kernelArgs,
		Drives: []firecracker.DriveConfig{{
			ID:       "rootfs",
			Path:     live.DMDev,
			ReadOnly: false,
			IsRoot:   true,
		}},
		TapDevice: tap,
		VSockPath: vm.Runtime.VSockPath,
		VSockCID:  vm.Runtime.VSockCID,
		VCPUCount: vm.Spec.VCPUCount,
		MemoryMiB: vm.Spec.MemoryMiB,
		Logger:    d.logger,
	}
	d.contributeMachineConfig(&machineConfig, vm, image)
	machine, err := firecracker.NewMachine(ctx, machineConfig)
	if err != nil {
		return cleanupOnErr(err)
	}
	if err := machine.Start(ctx); err != nil {
		// Use a fresh context: the request ctx may already be cancelled (client
		// disconnect), but we still need the PID so cleanupRuntime can kill the
		// Firecracker process that was spawned before the failure.
		live.PID = d.resolveFirecrackerPID(context.Background(), machine, apiSock)
		d.setVMHandles(vm, live)
		return cleanupOnErr(err)
	}
	live.PID = d.resolveFirecrackerPID(context.Background(), machine, apiSock)
	d.setVMHandles(vm, live)
	op.debugStage("firecracker_started", "pid", live.PID)
	op.stage("socket_access", "api_socket", apiSock)
	if err := d.ensureSocketAccess(ctx, apiSock, "firecracker api socket"); err != nil {
		return cleanupOnErr(err)
	}
	op.stage("vsock_access", "vsock_path", vm.Runtime.VSockPath, "vsock_cid", vm.Runtime.VSockCID)
	if err := d.ensureSocketAccess(ctx, vm.Runtime.VSockPath, "firecracker vsock socket"); err != nil {
		return cleanupOnErr(err)
	}
	vmCreateStage(ctx, "wait_vsock_agent", "waiting for guest vsock agent")
	if err := waitForGuestVSockAgent(ctx, d.logger, vm.Runtime.VSockPath, vsockReadyWait); err != nil {
		return cleanupOnErr(err)
	}
	op.stage("post_start_features")
	vmCreateStage(ctx, "wait_guest_ready", "waiting for guest services")
	if err := d.postStartCapabilities(ctx, vm, image); err != nil {
		return cleanupOnErr(err)
	}
	system.TouchNow(&vm)
	op.stage("persist")
	vmCreateStage(ctx, "finalize", "saving vm state")
	if err := d.store.UpsertVM(ctx, vm); err != nil {
		return cleanupOnErr(err)
	}
	return vm, nil
}

// StopVM gracefully stops the VM identified by idOrName: Ctrl-Alt-Del,
// wait for exit, then runtime teardown.
func (d *Daemon) StopVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return d.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		return d.stopVMLocked(ctx, vm)
	})
}

func (d *Daemon) stopVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
	vm = current
	op := d.beginOperation("vm.stop", "vm_ref", vm.ID)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	if !d.vmAlive(vm) {
		op.stage("cleanup_stale_runtime")
		if err := d.cleanupRuntime(ctx, vm, true); err != nil {
			return model.VMRecord{}, err
		}
		vm.State = model.VMStateStopped
		vm.Runtime.State = model.VMStateStopped
		d.clearVMHandles(vm)
		if err := d.store.UpsertVM(ctx, vm); err != nil {
			return model.VMRecord{}, err
		}
		return vm, nil
	}
	pid := d.vmHandles(vm.ID).PID
	op.stage("graceful_shutdown")
	if err := d.sendCtrlAltDel(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("wait_for_exit", "pid", pid)
	if err := d.waitForExit(ctx, pid, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil {
		if !errors.Is(err, errWaitForExitTimeout) {
			return model.VMRecord{}, err
		}
		op.stage("graceful_shutdown_timeout", "pid", pid)
	}
	op.stage("cleanup_runtime")
	if err := d.cleanupRuntime(ctx, vm, true); err != nil {
		return model.VMRecord{}, err
	}
	vm.State = model.VMStateStopped
	vm.Runtime.State = model.VMStateStopped
	d.clearVMHandles(vm)
	system.TouchNow(&vm)
	if err := d.store.UpsertVM(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	return vm, nil
}

// KillVM sends the requested signal (TERM when empty) to the VM's
// firecracker process, waits for exit, and tears down the runtime.
func (d *Daemon) KillVM(ctx context.Context, params api.VMKillParams) (model.VMRecord, error) {
	return d.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		return d.killVMLocked(ctx, vm, params.Signal)
	})
}

func (d *Daemon) killVMLocked(ctx context.Context, current model.VMRecord, signalValue string) (vm model.VMRecord, err error) {
	vm = current
	op := d.beginOperation("vm.kill", "vm_ref", vm.ID, "signal", signalValue)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	if !d.vmAlive(vm) {
		op.stage("cleanup_stale_runtime")
		if err := d.cleanupRuntime(ctx, vm, true); err != nil {
			return model.VMRecord{}, err
		}
		vm.State = model.VMStateStopped
		vm.Runtime.State = model.VMStateStopped
		d.clearVMHandles(vm)
		if err := d.store.UpsertVM(ctx, vm); err != nil {
			return model.VMRecord{}, err
		}
		return vm, nil
	}

	signal := strings.TrimSpace(signalValue)
	if signal == "" {
		signal = "TERM"
	}
	pid := d.vmHandles(vm.ID).PID
	op.stage("send_signal", "pid", pid, "signal", signal)
	if _, err := d.runner.RunSudo(ctx, "kill", "-"+signal, strconv.Itoa(pid)); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("wait_for_exit", "pid", pid)
	if err := d.waitForExit(ctx, pid, vm.Runtime.APISockPath, 30*time.Second); err != nil {
		if !errors.Is(err, errWaitForExitTimeout) {
			return model.VMRecord{}, err
		}
		op.stage("signal_timeout", "pid", pid, "signal", signal)
	}
	op.stage("cleanup_runtime")
	if err := d.cleanupRuntime(ctx, vm, true); err != nil {
		return model.VMRecord{}, err
	}
	vm.State = model.VMStateStopped
	vm.Runtime.State = model.VMStateStopped
	d.clearVMHandles(vm)
	system.TouchNow(&vm)
	if err := d.store.UpsertVM(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	return vm, nil
}

// RestartVM stops and restarts the VM identified by idOrName under a
// single per-VM lock.
func (d *Daemon) RestartVM(ctx context.Context, idOrName string) (vm model.VMRecord, err error) {
	op := d.beginOperation("vm.restart", "vm_ref", idOrName)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	resolved, err := d.FindVM(ctx, idOrName)
	if err != nil {
		return model.VMRecord{}, err
	}
	return d.withVMLockByID(ctx, resolved.ID, func(vm model.VMRecord) (model.VMRecord, error) {
		op.stage("stop")
		vm, err = d.stopVMLocked(ctx, vm)
		if err != nil {
			return model.VMRecord{}, err
		}
		image, err := d.store.GetImageByID(ctx, vm.ImageID)
		if err != nil {
			return model.VMRecord{}, err
		}
		op.stage("start", vmLogAttrs(vm)...)
		return d.startVMLocked(ctx, vm, image)
	})
}

// DeleteVM kills the VM if it is alive, tears down its runtime, and
// removes both the store record and the VMDir.
func (d *Daemon) DeleteVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return d.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		return d.deleteVMLocked(ctx, vm)
	})
}

func (d *Daemon) deleteVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
	vm = current
	op := d.beginOperation("vm.delete", "vm_ref", vm.ID)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	if d.vmAlive(vm) {
		pid := d.vmHandles(vm.ID).PID
		op.stage("kill_running_vm", "pid", pid)
		_ = d.killVMProcess(ctx, pid)
	}
	op.stage("cleanup_runtime")
	if err := d.cleanupRuntime(ctx, vm, false); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("delete_store_record")
	if err := d.store.DeleteVM(ctx, vm.ID); err != nil {
		return model.VMRecord{}, err
	}
	if vm.Runtime.VMDir != "" {
		op.stage("delete_vm_dir", "vm_dir", vm.Runtime.VMDir)
		if err := os.RemoveAll(vm.Runtime.VMDir); err != nil {
			return model.VMRecord{}, err
		}
	}
	return vm, nil
}