vm state: split transient kernel/process handles off the durable schema
Separates what a VM IS (durable intent + identity + deterministic
derived paths — `VMRuntime`) from what is CURRENTLY TRUE about it
(firecracker PID, tap device, loop devices, dm-snapshot target — new
`VMHandles`). The durable state lives in the SQLite `vms` row; the
transient state lives in an in-memory cache on the daemon plus a
per-VM `handles.json` scratch file inside VMDir. The cache is
rebuilt at startup from the scratch files and OS inspection.
Nothing kernel-level rides the SQLite schema anymore.
Why:
Persisting ephemeral process handles to SQLite forced reconcile to
treat "running with a stale PID" as a first-class case and mix it
with real state transitions. The schema described what we last
observed, not what the VM is. Every time the observation model
shifted (tap pool, DM naming, pgrep fallback) the reconcile logic
grew a new branch. Splitting lets each layer own what it's good at:
durable records describe intent, in-memory cache + scratch file
describe momentary reality.
Shape:
- `model.VMHandles` = PID, TapDevice, BaseLoop, COWLoop, DMName,
DMDev. Never in SQLite.
- `VMRuntime` keeps: State, GuestIP, APISockPath, VSockPath,
VSockCID, LogPath, MetricsPath, DNSName, VMDir, SystemOverlay,
WorkDiskPath, LastError. All durable or deterministic.
- `handleCache` on `*Daemon` — mutex-guarded map + scratch-file
plumbing (`writeHandlesFile` / `readHandlesFile` /
`rediscoverHandles`). See `internal/daemon/vm_handles.go`.
- `d.vmAlive(vm)` replaces the 20+ inline
  `vm.State==Running && ProcessRunning(vm.Runtime.PID, apiSock)`
  call sites. Single source of truth for liveness.
- Startup reconcile: per running VM, load the scratch file, pgrep
the api sock, either keep (cache seeded from scratch) or demote
to stopped (scratch handles passed to cleanupRuntime first so DM
/ loops / tap actually get torn down).
Verification:
- `go test ./...` green.
- Live: `banger vm run --name handles-test -- cat /etc/hostname`
starts; `handles.json` appears in VMDir with the expected PID,
tap, loops, DM.
- `kill -9 $(pgrep bangerd)` while the VM is running, re-invoke the
CLI, daemon auto-starts, reconcile recognises the VM as alive,
`banger vm ssh` still connects, `banger vm delete` cleans up.
Tests added:
- vm_handles_test.go: scratch-file roundtrip, missing/corrupt file
behaviour, cache concurrency, rediscoverHandles prefers pgrep
over scratch, returns scratch contents even when process is
dead (so cleanup can tear down kernel state).
- vm_test.go: reconcile test rewritten to exercise the new flow
(write scratch → reconcile reads it → verifies process is gone →
issues dmsetup/losetup teardown).
ARCHITECTURE.md updated; `handles` added to Daemon field docs.
This commit is contained in:
parent
2e6e64bc04
commit
687fcf0b59
27 changed files with 688 additions and 152 deletions
|
|
@ -18,6 +18,13 @@ owning types:
|
||||||
workspace operations on a single VM (two simultaneous tar imports
|
workspace operations on a single VM (two simultaneous tar imports
|
||||||
would clobber each other) without touching `vmLocks`, so
|
would clobber each other) without touching `vmLocks`, so
|
||||||
`vm stop` / `delete` / `restart` never queue behind a slow import.
|
`vm stop` / `delete` / `restart` never queue behind a slow import.
|
||||||
|
- `handles *handleCache` — in-memory map of per-VM transient kernel/
|
||||||
|
process handles (PID, tap device, loop devices, DM target). The
|
||||||
|
cache is rebuildable: each VM directory holds a small
|
||||||
|
`handles.json` scratch file that the daemon reads at startup to
|
||||||
|
reconstruct the cache and verify processes against `/proc` via
|
||||||
|
pgrep. Nothing in the durable `vms` SQLite row describes transient
|
||||||
|
kernel state. See `internal/daemon/vm_handles.go`.
|
||||||
- `createVMMu sync.Mutex` — serialises `CreateVM` (guards name uniqueness
|
- `createVMMu sync.Mutex` — serialises `CreateVM` (guards name uniqueness
|
||||||
+ guest IP allocation window).
|
+ guest IP allocation window).
|
||||||
- `imageOpsMu sync.Mutex` — serialises image-registry mutations
|
- `imageOpsMu sync.Mutex` — serialises image-registry mutations
|
||||||
|
|
|
||||||
|
|
@ -277,9 +277,10 @@ func (natCapability) Cleanup(ctx context.Context, d *Daemon, vm model.VMRecord)
|
||||||
if !vm.Spec.NATEnabled {
|
if !vm.Spec.NATEnabled {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
if strings.TrimSpace(vm.Runtime.GuestIP) == "" || strings.TrimSpace(vm.Runtime.TapDevice) == "" {
|
tap := d.vmHandles(vm.ID).TapDevice
|
||||||
|
if strings.TrimSpace(vm.Runtime.GuestIP) == "" || strings.TrimSpace(tap) == "" {
|
||||||
if d.logger != nil {
|
if d.logger != nil {
|
||||||
d.logger.Debug("skipping nat cleanup without runtime network handles", append(vmLogAttrs(vm), "guest_ip", vm.Runtime.GuestIP, "tap_device", vm.Runtime.TapDevice)...)
|
d.logger.Debug("skipping nat cleanup without runtime network handles", append(vmLogAttrs(vm), "guest_ip", vm.Runtime.GuestIP, "tap_device", tap)...)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
@ -290,7 +291,7 @@ func (natCapability) ApplyConfigChange(ctx context.Context, d *Daemon, before, a
|
||||||
if before.Spec.NATEnabled == after.Spec.NATEnabled {
|
if before.Spec.NATEnabled == after.Spec.NATEnabled {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
if after.State != model.VMStateRunning || !system.ProcessRunning(after.Runtime.PID, after.Runtime.APISockPath) {
|
if !d.vmAlive(after) {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
return d.ensureNAT(ctx, after, after.Spec.NATEnabled)
|
return d.ensureNAT(ctx, after, after.Spec.NATEnabled)
|
||||||
|
|
|
||||||
|
|
@ -44,7 +44,14 @@ type Daemon struct {
|
||||||
// other's tar streams). It is a SEPARATE scope from vmLocks so
|
// other's tar streams). It is a SEPARATE scope from vmLocks so
|
||||||
// slow guest I/O — SSH dial, tar upload, chmod — does not block
|
// slow guest I/O — SSH dial, tar upload, chmod — does not block
|
||||||
// vm stop/delete/restart. See ARCHITECTURE.md.
|
// vm stop/delete/restart. See ARCHITECTURE.md.
|
||||||
workspaceLocks vmLockSet
|
workspaceLocks vmLockSet
|
||||||
|
// handles caches per-VM transient kernel/process handles (PID,
|
||||||
|
// tap device, loop devices, DM name/device). Populated at vm
|
||||||
|
// start and at daemon startup reconcile; cleared on stop/delete.
|
||||||
|
// See internal/daemon/vm_handles.go — persistent durable state
|
||||||
|
// lives in the store, this is rebuildable from a per-VM
|
||||||
|
// handles.json scratch file and OS inspection.
|
||||||
|
handles *handleCache
|
||||||
sessions sessionRegistry
|
sessions sessionRegistry
|
||||||
tapPool tapPool
|
tapPool tapPool
|
||||||
closing chan struct{}
|
closing chan struct{}
|
||||||
|
|
@ -94,6 +101,7 @@ func Open(ctx context.Context) (d *Daemon, err error) {
|
||||||
logger: logger,
|
logger: logger,
|
||||||
closing: make(chan struct{}),
|
closing: make(chan struct{}),
|
||||||
pid: os.Getpid(),
|
pid: os.Getpid(),
|
||||||
|
handles: newHandleCache(),
|
||||||
sessions: newSessionRegistry(),
|
sessions: newSessionRegistry(),
|
||||||
}
|
}
|
||||||
d.ensureVMSSHClientConfig()
|
d.ensureVMSSHClientConfig()
|
||||||
|
|
@ -382,7 +390,7 @@ func (d *Daemon) dispatch(ctx context.Context, req rpc.Request) rpc.Response {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return rpc.NewError("not_found", err.Error())
|
return rpc.NewError("not_found", err.Error())
|
||||||
}
|
}
|
||||||
if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if !d.vmAlive(vm) {
|
||||||
return rpc.NewError("not_running", fmt.Sprintf("vm %s is not running", vm.Name))
|
return rpc.NewError("not_running", fmt.Sprintf("vm %s is not running", vm.Name))
|
||||||
}
|
}
|
||||||
return marshalResultOrError(api.VMSSHResult{Name: vm.Name, GuestIP: vm.Runtime.GuestIP}, nil)
|
return marshalResultOrError(api.VMSSHResult{Name: vm.Name, GuestIP: vm.Runtime.GuestIP}, nil)
|
||||||
|
|
@ -609,16 +617,32 @@ func (d *Daemon) reconcile(ctx context.Context) error {
|
||||||
for _, vm := range vms {
|
for _, vm := range vms {
|
||||||
if err := d.withVMLockByIDErr(ctx, vm.ID, func(vm model.VMRecord) error {
|
if err := d.withVMLockByIDErr(ctx, vm.ID, func(vm model.VMRecord) error {
|
||||||
if vm.State != model.VMStateRunning {
|
if vm.State != model.VMStateRunning {
|
||||||
|
// Belt-and-braces: a stopped VM should never have a
|
||||||
|
// scratch file or a cache entry. Clean up anything
|
||||||
|
// left by an ungraceful previous daemon crash.
|
||||||
|
d.clearVMHandles(vm)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
if system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
// Rebuild the in-memory handle cache by loading the per-VM
|
||||||
|
// scratch file and verifying the firecracker process is
|
||||||
|
// still alive.
|
||||||
|
h, alive, err := d.rediscoverHandles(ctx, vm)
|
||||||
|
if err != nil && d.logger != nil {
|
||||||
|
d.logger.Warn("rediscover handles failed", "vm_id", vm.ID, "error", err.Error())
|
||||||
|
}
|
||||||
|
// Either way, seed the cache with what the scratch file
|
||||||
|
// claimed. If alive, subsequent vmAlive() calls pass; if
|
||||||
|
// not, cleanupRuntime needs these handles to know which
|
||||||
|
// kernel resources (DM / loops / tap) to tear down.
|
||||||
|
d.setVMHandlesInMemory(vm.ID, h)
|
||||||
|
if alive {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
op.stage("stale_vm", vmLogAttrs(vm)...)
|
op.stage("stale_vm", vmLogAttrs(vm)...)
|
||||||
_ = d.cleanupRuntime(ctx, vm, true)
|
_ = d.cleanupRuntime(ctx, vm, true)
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
clearRuntimeHandles(&vm)
|
d.clearVMHandles(vm)
|
||||||
vm.UpdatedAt = model.Now()
|
vm.UpdatedAt = model.Now()
|
||||||
return d.store.UpsertVM(ctx, vm)
|
return d.store.UpsertVM(ctx, vm)
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
|
|
|
||||||
|
|
@ -52,7 +52,7 @@ func (d *Daemon) DashboardSummary(ctx context.Context) (api.DashboardSummary, er
|
||||||
summary.Banger.ConfiguredDiskBytes += vm.Spec.WorkDiskSizeBytes
|
summary.Banger.ConfiguredDiskBytes += vm.Spec.WorkDiskSizeBytes
|
||||||
summary.Banger.UsedSystemOverlayBytes += vm.Stats.SystemOverlayBytes
|
summary.Banger.UsedSystemOverlayBytes += vm.Stats.SystemOverlayBytes
|
||||||
summary.Banger.UsedWorkDiskBytes += vm.Stats.WorkDiskBytes
|
summary.Banger.UsedWorkDiskBytes += vm.Stats.WorkDiskBytes
|
||||||
if vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if d.vmAlive(vm) {
|
||||||
summary.Banger.RunningVMCount++
|
summary.Banger.RunningVMCount++
|
||||||
summary.Banger.RunningCPUPercent += vm.Stats.CPUPercent
|
summary.Banger.RunningCPUPercent += vm.Stats.CPUPercent
|
||||||
summary.Banger.RunningRSSBytes += vm.Stats.RSSBytes
|
summary.Banger.RunningRSSBytes += vm.Stats.RSSBytes
|
||||||
|
|
|
||||||
|
|
@ -74,7 +74,7 @@ func (d *Daemon) refreshGuestSession(ctx context.Context, vm model.VMRecord, s m
|
||||||
return s, err
|
return s, err
|
||||||
}
|
}
|
||||||
original := s
|
original := s
|
||||||
session.ApplyStateSnapshot(&s, snapshot, vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath))
|
session.ApplyStateSnapshot(&s, snapshot, d.vmAlive(vm))
|
||||||
if session.StateChanged(original, s) {
|
if session.StateChanged(original, s) {
|
||||||
s.UpdatedAt = model.Now()
|
s.UpdatedAt = model.Now()
|
||||||
if err := d.store.UpsertGuestSession(ctx, s); err != nil {
|
if err := d.store.UpsertGuestSession(ctx, s); err != nil {
|
||||||
|
|
@ -85,7 +85,7 @@ func (d *Daemon) refreshGuestSession(ctx context.Context, vm model.VMRecord, s m
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Daemon) inspectGuestSessionState(ctx context.Context, vm model.VMRecord, s model.GuestSession) (session.StateSnapshot, error) {
|
func (d *Daemon) inspectGuestSessionState(ctx context.Context, vm model.VMRecord, s model.GuestSession) (session.StateSnapshot, error) {
|
||||||
if vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if d.vmAlive(vm) {
|
||||||
client, err := guest.Dial(ctx, net.JoinHostPort(vm.Runtime.GuestIP, "22"), d.config.SSHKeyPath)
|
client, err := guest.Dial(ctx, net.JoinHostPort(vm.Runtime.GuestIP, "22"), d.config.SSHKeyPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return session.StateSnapshot{}, err
|
return session.StateSnapshot{}, err
|
||||||
|
|
|
||||||
|
|
@ -94,7 +94,6 @@ func TestSendToGuestSession_HappyPath(t *testing.T) {
|
||||||
vm := testVM("sendbox", "image-send", "172.16.0.88")
|
vm := testVM("sendbox", "image-send", "172.16.0.88")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = firecracker.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
upsertDaemonVM(t, ctx, db, vm)
|
upsertDaemonVM(t, ctx, db, vm)
|
||||||
|
|
||||||
|
|
@ -105,6 +104,7 @@ func TestSendToGuestSession_HappyPath(t *testing.T) {
|
||||||
|
|
||||||
fake := &recordingGuestSSHClient{}
|
fake := &recordingGuestSSHClient{}
|
||||||
d := newSendTestDaemon(t, db, fake)
|
d := newSendTestDaemon(t, db, fake)
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
|
||||||
|
|
||||||
payload := []byte(`{"type":"abort"}` + "\n")
|
payload := []byte(`{"type":"abort"}` + "\n")
|
||||||
result, err := d.SendToGuestSession(ctx, api.GuestSessionSendParams{
|
result, err := d.SendToGuestSession(ctx, api.GuestSessionSendParams{
|
||||||
|
|
@ -159,7 +159,6 @@ func TestSendToGuestSession_EmptyPayload(t *testing.T) {
|
||||||
vm := testVM("sendbox-empty", "image-send", "172.16.0.89")
|
vm := testVM("sendbox-empty", "image-send", "172.16.0.89")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = firecracker.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
upsertDaemonVM(t, ctx, db, vm)
|
upsertDaemonVM(t, ctx, db, vm)
|
||||||
|
|
||||||
|
|
@ -170,6 +169,7 @@ func TestSendToGuestSession_EmptyPayload(t *testing.T) {
|
||||||
|
|
||||||
fake := &recordingGuestSSHClient{}
|
fake := &recordingGuestSSHClient{}
|
||||||
d := newSendTestDaemon(t, db, fake)
|
d := newSendTestDaemon(t, db, fake)
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
|
||||||
|
|
||||||
result, err := d.SendToGuestSession(ctx, api.GuestSessionSendParams{
|
result, err := d.SendToGuestSession(ctx, api.GuestSessionSendParams{
|
||||||
VMIDOrName: vm.Name,
|
VMIDOrName: vm.Name,
|
||||||
|
|
@ -423,7 +423,6 @@ func TestPrepareWorkspaceThenStartGuestSessionPassesCWDPreflight(t *testing.T) {
|
||||||
vm := testVM("pi-devbox", "image-pi", "172.16.0.77")
|
vm := testVM("pi-devbox", "image-pi", "172.16.0.77")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = firecracker.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
upsertDaemonVM(t, ctx, db, vm)
|
upsertDaemonVM(t, ctx, db, vm)
|
||||||
|
|
||||||
|
|
@ -433,6 +432,7 @@ func TestPrepareWorkspaceThenStartGuestSessionPassesCWDPreflight(t *testing.T) {
|
||||||
config: model.DaemonConfig{SSHKeyPath: filepath.Join(t.TempDir(), "id_ed25519")},
|
config: model.DaemonConfig{SSHKeyPath: filepath.Join(t.TempDir(), "id_ed25519")},
|
||||||
logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
|
logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
|
||||||
}
|
}
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
|
||||||
d.guestWaitForSSH = func(context.Context, string, string, time.Duration) error { return nil }
|
d.guestWaitForSSH = func(context.Context, string, string, time.Duration) error { return nil }
|
||||||
d.guestDial = func(context.Context, string, string) (guestSSHClient, error) { return fakeClient, nil }
|
d.guestDial = func(context.Context, string, string) (guestSSHClient, error) { return fakeClient, nil }
|
||||||
d.waitForGuestSessionReady = func(_ context.Context, _ model.VMRecord, session model.GuestSession) (model.GuestSession, error) {
|
d.waitForGuestSessionReady = func(_ context.Context, _ model.VMRecord, session model.GuestSession) (model.GuestSession, error) {
|
||||||
|
|
|
||||||
|
|
@ -98,6 +98,10 @@ func (o operationLog) log(level slog.Level, msg string, attrs ...any) {
|
||||||
o.logger.Log(context.Background(), level, msg, base...)
|
o.logger.Log(context.Background(), level, msg, base...)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// vmLogAttrs returns the durable identifying fields for a VM that
|
||||||
|
// are always safe to log. Transient handles (PID, tap device) moved
|
||||||
|
// off VMRecord when the schema was split; lifecycle ops log those
|
||||||
|
// explicitly on the events where they matter (e.g. wait_for_exit).
|
||||||
func vmLogAttrs(vm model.VMRecord) []any {
|
func vmLogAttrs(vm model.VMRecord) []any {
|
||||||
attrs := []any{
|
attrs := []any{
|
||||||
"vm_id", vm.ID,
|
"vm_id", vm.ID,
|
||||||
|
|
@ -107,15 +111,9 @@ func vmLogAttrs(vm model.VMRecord) []any {
|
||||||
if vm.Runtime.GuestIP != "" {
|
if vm.Runtime.GuestIP != "" {
|
||||||
attrs = append(attrs, "guest_ip", vm.Runtime.GuestIP)
|
attrs = append(attrs, "guest_ip", vm.Runtime.GuestIP)
|
||||||
}
|
}
|
||||||
if vm.Runtime.TapDevice != "" {
|
|
||||||
attrs = append(attrs, "tap_device", vm.Runtime.TapDevice)
|
|
||||||
}
|
|
||||||
if vm.Runtime.APISockPath != "" {
|
if vm.Runtime.APISockPath != "" {
|
||||||
attrs = append(attrs, "api_socket", vm.Runtime.APISockPath)
|
attrs = append(attrs, "api_socket", vm.Runtime.APISockPath)
|
||||||
}
|
}
|
||||||
if vm.Runtime.PID > 0 {
|
|
||||||
attrs = append(attrs, "pid", vm.Runtime.PID)
|
|
||||||
}
|
|
||||||
if vm.Runtime.LogPath != "" {
|
if vm.Runtime.LogPath != "" {
|
||||||
attrs = append(attrs, "log_path", vm.Runtime.LogPath)
|
attrs = append(attrs, "log_path", vm.Runtime.LogPath)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,7 @@ import (
|
||||||
type natRule = hostnat.Rule
|
type natRule = hostnat.Rule
|
||||||
|
|
||||||
func (d *Daemon) ensureNAT(ctx context.Context, vm model.VMRecord, enable bool) error {
|
func (d *Daemon) ensureNAT(ctx context.Context, vm model.VMRecord, enable bool) error {
|
||||||
return hostnat.Ensure(ctx, d.runner, vm.Runtime.GuestIP, vm.Runtime.TapDevice, enable)
|
return hostnat.Ensure(ctx, d.runner, vm.Runtime.GuestIP, d.vmHandles(vm.ID).TapDevice, enable)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Daemon) validateNATPrereqs(ctx context.Context) (string, error) {
|
func (d *Daemon) validateNATPrereqs(ctx context.Context) (string, error) {
|
||||||
|
|
@ -32,8 +32,12 @@ func parseDefaultUplink(output string) (string, error) {
|
||||||
return hostnat.ParseDefaultUplink(output)
|
return hostnat.ParseDefaultUplink(output)
|
||||||
}
|
}
|
||||||
|
|
||||||
func natRulesForVM(vm model.VMRecord, uplink string) ([]natRule, error) {
|
// natRulesForVM builds the iptables rule set for vm + tap + uplink.
|
||||||
return hostnat.Rules(vm.Runtime.GuestIP, vm.Runtime.TapDevice, uplink)
|
// tap is passed explicitly (rather than read from a handle cache)
|
||||||
|
// because natRulesForVM has no Daemon receiver — it's usable from
|
||||||
|
// test helpers that build rule expectations without a daemon.
|
||||||
|
func natRulesForVM(vm model.VMRecord, tap, uplink string) ([]natRule, error) {
|
||||||
|
return hostnat.Rules(vm.Runtime.GuestIP, tap, uplink)
|
||||||
}
|
}
|
||||||
|
|
||||||
func natRuleArgs(action string, rule natRule) []string {
|
func natRuleArgs(action string, rule natRule) []string {
|
||||||
|
|
|
||||||
|
|
@ -33,11 +33,10 @@ func TestNATRulesForVM(t *testing.T) {
|
||||||
|
|
||||||
vm := model.VMRecord{
|
vm := model.VMRecord{
|
||||||
Runtime: model.VMRuntime{
|
Runtime: model.VMRuntime{
|
||||||
GuestIP: "172.16.0.8",
|
GuestIP: "172.16.0.8",
|
||||||
TapDevice: "tap-fc-abcd1234",
|
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
rules, err := natRulesForVM(vm, "wlan0")
|
rules, err := natRulesForVM(vm, "tap-fc-abcd1234", "wlan0")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("natRulesForVM returned error: %v", err)
|
t.Fatalf("natRulesForVM returned error: %v", err)
|
||||||
}
|
}
|
||||||
|
|
@ -61,30 +60,25 @@ func TestNATRulesForVMRequiresRuntimeData(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
vm model.VMRecord
|
vm model.VMRecord
|
||||||
|
tap string
|
||||||
uplink string
|
uplink string
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
name: "guest ip",
|
name: "guest ip",
|
||||||
vm: model.VMRecord{
|
vm: model.VMRecord{},
|
||||||
Runtime: model.VMRuntime{TapDevice: "tap-fc-abcd1234"},
|
tap: "tap-fc-abcd1234",
|
||||||
},
|
|
||||||
uplink: "eth0",
|
uplink: "eth0",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "tap",
|
name: "tap",
|
||||||
vm: model.VMRecord{
|
vm: model.VMRecord{Runtime: model.VMRuntime{GuestIP: "172.16.0.8"}},
|
||||||
Runtime: model.VMRuntime{GuestIP: "172.16.0.8"},
|
tap: "",
|
||||||
},
|
|
||||||
uplink: "eth0",
|
uplink: "eth0",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "uplink",
|
name: "uplink",
|
||||||
vm: model.VMRecord{
|
vm: model.VMRecord{Runtime: model.VMRuntime{GuestIP: "172.16.0.8"}},
|
||||||
Runtime: model.VMRuntime{
|
tap: "tap-fc-abcd1234",
|
||||||
GuestIP: "172.16.0.8",
|
|
||||||
TapDevice: "tap-fc-abcd1234",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
uplink: "",
|
uplink: "",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
@ -93,7 +87,7 @@ func TestNATRulesForVMRequiresRuntimeData(t *testing.T) {
|
||||||
tt := tt
|
tt := tt
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
if _, err := natRulesForVM(tt.vm, tt.uplink); err == nil {
|
if _, err := natRulesForVM(tt.vm, tt.tap, tt.uplink); err == nil {
|
||||||
t.Fatalf("expected natRulesForVM to fail for missing %s", tt.name)
|
t.Fatalf("expected natRulesForVM to fail for missing %s", tt.name)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,6 @@ import (
|
||||||
|
|
||||||
"banger/internal/api"
|
"banger/internal/api"
|
||||||
"banger/internal/model"
|
"banger/internal/model"
|
||||||
"banger/internal/system"
|
|
||||||
"banger/internal/vmdns"
|
"banger/internal/vmdns"
|
||||||
"banger/internal/vsockagent"
|
"banger/internal/vsockagent"
|
||||||
)
|
)
|
||||||
|
|
@ -29,7 +28,7 @@ func (d *Daemon) PortsVM(ctx context.Context, idOrName string) (result api.VMPor
|
||||||
if result.DNSName == "" && strings.TrimSpace(vm.Name) != "" {
|
if result.DNSName == "" && strings.TrimSpace(vm.Name) != "" {
|
||||||
result.DNSName = vmdns.RecordName(vm.Name)
|
result.DNSName = vmdns.RecordName(vm.Name)
|
||||||
}
|
}
|
||||||
if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if !d.vmAlive(vm) {
|
||||||
return model.VMRecord{}, fmt.Errorf("vm %s is not running", vm.Name)
|
return model.VMRecord{}, fmt.Errorf("vm %s is not running", vm.Name)
|
||||||
}
|
}
|
||||||
if strings.TrimSpace(vm.Runtime.GuestIP) == "" {
|
if strings.TrimSpace(vm.Runtime.GuestIP) == "" {
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,6 @@ import (
|
||||||
"banger/internal/guest"
|
"banger/internal/guest"
|
||||||
"banger/internal/model"
|
"banger/internal/model"
|
||||||
"banger/internal/sessionstream"
|
"banger/internal/sessionstream"
|
||||||
"banger/internal/system"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func (d *Daemon) BeginGuestSessionAttach(ctx context.Context, params api.GuestSessionAttachBeginParams) (api.GuestSessionAttachBeginResult, error) {
|
func (d *Daemon) BeginGuestSessionAttach(ctx context.Context, params api.GuestSessionAttachBeginParams) (api.GuestSessionAttachBeginResult, error) {
|
||||||
|
|
@ -162,7 +161,7 @@ func (d *Daemon) attachGuestSessionBridge(session model.GuestSession, controller
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if !d.vmAlive(vm) {
|
||||||
return fmt.Errorf("vm %q is not running", vm.Name)
|
return fmt.Errorf("vm %q is not running", vm.Name)
|
||||||
}
|
}
|
||||||
address := net.JoinHostPort(vm.Runtime.GuestIP, "22")
|
address := net.JoinHostPort(vm.Runtime.GuestIP, "22")
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,6 @@ import (
|
||||||
sess "banger/internal/daemon/session"
|
sess "banger/internal/daemon/session"
|
||||||
"banger/internal/guest"
|
"banger/internal/guest"
|
||||||
"banger/internal/model"
|
"banger/internal/model"
|
||||||
"banger/internal/system"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func (d *Daemon) StartGuestSession(ctx context.Context, params api.GuestSessionStartParams) (model.GuestSession, error) {
|
func (d *Daemon) StartGuestSession(ctx context.Context, params api.GuestSessionStartParams) (model.GuestSession, error) {
|
||||||
|
|
@ -29,7 +28,7 @@ func (d *Daemon) StartGuestSession(ctx context.Context, params api.GuestSessionS
|
||||||
}
|
}
|
||||||
var created model.GuestSession
|
var created model.GuestSession
|
||||||
_, err := d.withVMLockByRef(ctx, params.VMIDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
|
_, err := d.withVMLockByRef(ctx, params.VMIDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
|
||||||
if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if !d.vmAlive(vm) {
|
||||||
return model.VMRecord{}, fmt.Errorf("vm %q is not running", vm.Name)
|
return model.VMRecord{}, fmt.Errorf("vm %q is not running", vm.Name)
|
||||||
}
|
}
|
||||||
session, err := d.startGuestSessionLocked(ctx, vm, params, stdinMode)
|
session, err := d.startGuestSessionLocked(ctx, vm, params, stdinMode)
|
||||||
|
|
@ -184,7 +183,7 @@ func (d *Daemon) signalGuestSession(ctx context.Context, params api.GuestSession
|
||||||
if session.Status == model.GuestSessionStatusExited || session.Status == model.GuestSessionStatusFailed {
|
if session.Status == model.GuestSessionStatusExited || session.Status == model.GuestSessionStatusFailed {
|
||||||
return session, nil
|
return session, nil
|
||||||
}
|
}
|
||||||
if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if !d.vmAlive(vm) {
|
||||||
session.Status = model.GuestSessionStatusFailed
|
session.Status = model.GuestSessionStatusFailed
|
||||||
session.LastError = "vm is not running"
|
session.LastError = "vm is not running"
|
||||||
now := model.Now()
|
now := model.Now()
|
||||||
|
|
|
||||||
|
|
@ -59,7 +59,7 @@ func (d *Daemon) SendToGuestSession(ctx context.Context, params api.GuestSession
|
||||||
if session.Status != model.GuestSessionStatusRunning {
|
if session.Status != model.GuestSessionStatusRunning {
|
||||||
return api.GuestSessionSendResult{}, fmt.Errorf("session is not running (status=%s)", session.Status)
|
return api.GuestSessionSendResult{}, fmt.Errorf("session is not running (status=%s)", session.Status)
|
||||||
}
|
}
|
||||||
if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if !d.vmAlive(vm) {
|
||||||
return api.GuestSessionSendResult{}, fmt.Errorf("vm %q is not running", vm.Name)
|
return api.GuestSessionSendResult{}, fmt.Errorf("vm %q is not running", vm.Name)
|
||||||
}
|
}
|
||||||
if len(params.Payload) == 0 {
|
if len(params.Payload) == 0 {
|
||||||
|
|
@ -89,7 +89,7 @@ func (d *Daemon) SendToGuestSession(ctx context.Context, params api.GuestSession
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Daemon) readGuestSessionLog(ctx context.Context, vm model.VMRecord, session model.GuestSession, stream string, tailLines int) (string, error) {
|
func (d *Daemon) readGuestSessionLog(ctx context.Context, vm model.VMRecord, session model.GuestSession, stream string, tailLines int) (string, error) {
|
||||||
if vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if d.vmAlive(vm) {
|
||||||
client, err := guest.Dial(ctx, net.JoinHostPort(vm.Runtime.GuestIP, "22"), d.config.SSHKeyPath)
|
client, err := guest.Dial(ctx, net.JoinHostPort(vm.Runtime.GuestIP, "22"), d.config.SSHKeyPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,7 @@ func (d *Daemon) initializeTapPool(ctx context.Context) error {
|
||||||
}
|
}
|
||||||
next := 0
|
next := 0
|
||||||
for _, vm := range vms {
|
for _, vm := range vms {
|
||||||
if index, ok := parseTapPoolIndex(vm.Runtime.TapDevice); ok && index >= next {
|
if index, ok := parseTapPoolIndex(d.vmHandles(vm.ID).TapDevice); ok && index >= next {
|
||||||
next = index + 1
|
next = index + 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -85,7 +85,8 @@ func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserve
|
||||||
if d.logger != nil {
|
if d.logger != nil {
|
||||||
d.logger.Debug("cleanup runtime", append(vmLogAttrs(vm), "preserve_disks", preserveDisks)...)
|
d.logger.Debug("cleanup runtime", append(vmLogAttrs(vm), "preserve_disks", preserveDisks)...)
|
||||||
}
|
}
|
||||||
cleanupPID := vm.Runtime.PID
|
h := d.vmHandles(vm.ID)
|
||||||
|
cleanupPID := h.PID
|
||||||
if vm.Runtime.APISockPath != "" {
|
if vm.Runtime.APISockPath != "" {
|
||||||
if pid, err := d.findFirecrackerPID(ctx, vm.Runtime.APISockPath); err == nil && pid > 0 {
|
if pid, err := d.findFirecrackerPID(ctx, vm.Runtime.APISockPath); err == nil && pid > 0 {
|
||||||
cleanupPID = pid
|
cleanupPID = pid
|
||||||
|
|
@ -98,15 +99,15 @@ func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserve
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
snapshotErr := d.cleanupDMSnapshot(ctx, dmSnapshotHandles{
|
snapshotErr := d.cleanupDMSnapshot(ctx, dmSnapshotHandles{
|
||||||
BaseLoop: vm.Runtime.BaseLoop,
|
BaseLoop: h.BaseLoop,
|
||||||
COWLoop: vm.Runtime.COWLoop,
|
COWLoop: h.COWLoop,
|
||||||
DMName: vm.Runtime.DMName,
|
DMName: h.DMName,
|
||||||
DMDev: vm.Runtime.DMDev,
|
DMDev: h.DMDev,
|
||||||
})
|
})
|
||||||
featureErr := d.cleanupCapabilityState(ctx, vm)
|
featureErr := d.cleanupCapabilityState(ctx, vm)
|
||||||
var tapErr error
|
var tapErr error
|
||||||
if vm.Runtime.TapDevice != "" {
|
if h.TapDevice != "" {
|
||||||
tapErr = d.releaseTap(ctx, vm.Runtime.TapDevice)
|
tapErr = d.releaseTap(ctx, h.TapDevice)
|
||||||
}
|
}
|
||||||
if vm.Runtime.APISockPath != "" {
|
if vm.Runtime.APISockPath != "" {
|
||||||
_ = os.Remove(vm.Runtime.APISockPath)
|
_ = os.Remove(vm.Runtime.APISockPath)
|
||||||
|
|
@ -114,22 +115,16 @@ func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserve
|
||||||
if vm.Runtime.VSockPath != "" {
|
if vm.Runtime.VSockPath != "" {
|
||||||
_ = os.Remove(vm.Runtime.VSockPath)
|
_ = os.Remove(vm.Runtime.VSockPath)
|
||||||
}
|
}
|
||||||
|
// The handles are only meaningful while the kernel objects exist;
|
||||||
|
// dropping them here keeps the cache in sync with reality even
|
||||||
|
// when the caller forgets to call clearVMHandles explicitly.
|
||||||
|
d.clearVMHandles(vm)
|
||||||
if !preserveDisks && vm.Runtime.VMDir != "" {
|
if !preserveDisks && vm.Runtime.VMDir != "" {
|
||||||
return errors.Join(snapshotErr, featureErr, tapErr, os.RemoveAll(vm.Runtime.VMDir))
|
return errors.Join(snapshotErr, featureErr, tapErr, os.RemoveAll(vm.Runtime.VMDir))
|
||||||
}
|
}
|
||||||
return errors.Join(snapshotErr, featureErr, tapErr)
|
return errors.Join(snapshotErr, featureErr, tapErr)
|
||||||
}
|
}
|
||||||
|
|
||||||
func clearRuntimeHandles(vm *model.VMRecord) {
|
|
||||||
vm.Runtime.PID = 0
|
|
||||||
vm.Runtime.APISockPath = ""
|
|
||||||
vm.Runtime.TapDevice = ""
|
|
||||||
vm.Runtime.BaseLoop = ""
|
|
||||||
vm.Runtime.COWLoop = ""
|
|
||||||
vm.Runtime.DMName = ""
|
|
||||||
vm.Runtime.DMDev = ""
|
|
||||||
}
|
|
||||||
|
|
||||||
func defaultVSockPath(runtimeDir, vmID string) string {
|
func defaultVSockPath(runtimeDir, vmID string) string {
|
||||||
return filepath.Join(runtimeDir, "fc-"+system.ShortID(vmID)+".vsock")
|
return filepath.Join(runtimeDir, "fc-"+system.ShortID(vmID)+".vsock")
|
||||||
}
|
}
|
||||||
|
|
@ -205,10 +200,7 @@ func (d *Daemon) rebuildDNS(ctx context.Context) error {
|
||||||
}
|
}
|
||||||
records := make(map[string]string)
|
records := make(map[string]string)
|
||||||
for _, vm := range vms {
|
for _, vm := range vms {
|
||||||
if vm.State != model.VMStateRunning {
|
if !d.vmAlive(vm) {
|
||||||
continue
|
|
||||||
}
|
|
||||||
if !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if strings.TrimSpace(vm.Runtime.GuestIP) == "" {
|
if strings.TrimSpace(vm.Runtime.GuestIP) == "" {
|
||||||
|
|
|
||||||
|
|
@ -26,12 +26,20 @@ func (d *Daemon) ensureSystemOverlay(ctx context.Context, vm *model.VMRecord) er
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// patchRootOverlay writes the per-VM config files (resolv.conf,
|
||||||
|
// hostname, hosts, sshd drop-in, network bootstrap, fstab) into the
|
||||||
|
// rootfs overlay. Reads the DM device path from the handle cache,
|
||||||
|
// which the start flow populates before calling this.
|
||||||
func (d *Daemon) patchRootOverlay(ctx context.Context, vm model.VMRecord, image model.Image) error {
|
func (d *Daemon) patchRootOverlay(ctx context.Context, vm model.VMRecord, image model.Image) error {
|
||||||
|
dmDev := d.vmHandles(vm.ID).DMDev
|
||||||
|
if dmDev == "" {
|
||||||
|
return fmt.Errorf("vm %q: DM device not in handle cache — start flow out of order?", vm.ID)
|
||||||
|
}
|
||||||
resolv := []byte(fmt.Sprintf("nameserver %s\n", d.config.DefaultDNS))
|
resolv := []byte(fmt.Sprintf("nameserver %s\n", d.config.DefaultDNS))
|
||||||
hostname := []byte(vm.Name + "\n")
|
hostname := []byte(vm.Name + "\n")
|
||||||
hosts := []byte(fmt.Sprintf("127.0.0.1 localhost\n127.0.1.1 %s\n", vm.Name))
|
hosts := []byte(fmt.Sprintf("127.0.0.1 localhost\n127.0.1.1 %s\n", vm.Name))
|
||||||
sshdConfig := []byte(sshdGuestConfig())
|
sshdConfig := []byte(sshdGuestConfig())
|
||||||
fstab, err := system.ReadDebugFSText(ctx, d.runner, vm.Runtime.DMDev, "/etc/fstab")
|
fstab, err := system.ReadDebugFSText(ctx, d.runner, dmDev, "/etc/fstab")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fstab = ""
|
fstab = ""
|
||||||
}
|
}
|
||||||
|
|
@ -66,12 +74,12 @@ func (d *Daemon) patchRootOverlay(ctx context.Context, vm model.VMRecord, image
|
||||||
for _, guestPath := range builder.FilePaths() {
|
for _, guestPath := range builder.FilePaths() {
|
||||||
data := files[guestPath]
|
data := files[guestPath]
|
||||||
if guestPath == guestnet.GuestScriptPath {
|
if guestPath == guestnet.GuestScriptPath {
|
||||||
if err := system.WriteExt4FileMode(ctx, d.runner, vm.Runtime.DMDev, guestPath, 0o755, data); err != nil {
|
if err := system.WriteExt4FileMode(ctx, d.runner, dmDev, guestPath, 0o755, data); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if err := system.WriteExt4File(ctx, d.runner, vm.Runtime.DMDev, guestPath, data); err != nil {
|
if err := system.WriteExt4File(ctx, d.runner, dmDev, guestPath, data); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -109,7 +117,11 @@ func (d *Daemon) ensureWorkDisk(ctx context.Context, vm *model.VMRecord, image m
|
||||||
if _, err := d.runner.Run(ctx, "mkfs.ext4", "-F", vm.Runtime.WorkDiskPath); err != nil {
|
if _, err := d.runner.Run(ctx, "mkfs.ext4", "-F", vm.Runtime.WorkDiskPath); err != nil {
|
||||||
return workDiskPreparation{}, err
|
return workDiskPreparation{}, err
|
||||||
}
|
}
|
||||||
rootMount, cleanupRoot, err := system.MountTempDir(ctx, d.runner, vm.Runtime.DMDev, true)
|
dmDev := d.vmHandles(vm.ID).DMDev
|
||||||
|
if dmDev == "" {
|
||||||
|
return workDiskPreparation{}, fmt.Errorf("vm %q: DM device not in handle cache", vm.ID)
|
||||||
|
}
|
||||||
|
rootMount, cleanupRoot, err := system.MountTempDir(ctx, d.runner, dmDev, true)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return workDiskPreparation{}, err
|
return workDiskPreparation{}, err
|
||||||
}
|
}
|
||||||
|
|
|
||||||
211
internal/daemon/vm_handles.go
Normal file
211
internal/daemon/vm_handles.go
Normal file
|
|
@ -0,0 +1,211 @@
|
||||||
|
package daemon
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"banger/internal/model"
|
||||||
|
"banger/internal/system"
|
||||||
|
)
|
||||||
|
|
||||||
|
// handleCache is the daemon's in-memory map of per-VM transient
|
||||||
|
// handles. It is the sole runtime source of truth for PID / tap /
|
||||||
|
// loop / DM state — persistent storage (the per-VM handles.json
|
||||||
|
// scratch file) exists only so the daemon can rebuild the cache
|
||||||
|
// after a restart.
|
||||||
|
type handleCache struct {
|
||||||
|
mu sync.RWMutex
|
||||||
|
m map[string]model.VMHandles
|
||||||
|
}
|
||||||
|
|
||||||
|
func newHandleCache() *handleCache {
|
||||||
|
return &handleCache{m: make(map[string]model.VMHandles)}
|
||||||
|
}
|
||||||
|
|
||||||
|
// get returns the cached handles for vmID and whether an entry
|
||||||
|
// exists. A missing entry means "no live handles tracked," which is
|
||||||
|
// the correct state for stopped VMs.
|
||||||
|
func (c *handleCache) get(vmID string) (model.VMHandles, bool) {
|
||||||
|
c.mu.RLock()
|
||||||
|
defer c.mu.RUnlock()
|
||||||
|
h, ok := c.m[vmID]
|
||||||
|
return h, ok
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *handleCache) set(vmID string, h model.VMHandles) {
|
||||||
|
c.mu.Lock()
|
||||||
|
defer c.mu.Unlock()
|
||||||
|
c.m[vmID] = h
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *handleCache) clear(vmID string) {
|
||||||
|
c.mu.Lock()
|
||||||
|
defer c.mu.Unlock()
|
||||||
|
delete(c.m, vmID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// handlesFilePath returns the scratch file path inside the VM
|
||||||
|
// directory where the daemon writes the last-known handles.
|
||||||
|
func handlesFilePath(vmDir string) string {
|
||||||
|
return filepath.Join(vmDir, "handles.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeHandlesFile persists h to <vmDir>/handles.json. Called
|
||||||
|
// whenever the daemon successfully transitions a VM to running
|
||||||
|
// (after all handles are acquired). Best-effort: a write failure is
|
||||||
|
// logged, not propagated — the in-memory cache is authoritative
|
||||||
|
// while the daemon is up.
|
||||||
|
func writeHandlesFile(vmDir string, h model.VMHandles) error {
|
||||||
|
if vmDir == "" {
|
||||||
|
return errors.New("vm dir is required")
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(vmDir, 0o755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(h, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.WriteFile(handlesFilePath(vmDir), data, 0o600)
|
||||||
|
}
|
||||||
|
|
||||||
|
// readHandlesFile loads the scratch file written at the last start.
|
||||||
|
// Returns a zero-value handles + (false, nil) if the file doesn't
|
||||||
|
// exist — that's the normal case for stopped VMs.
|
||||||
|
func readHandlesFile(vmDir string) (model.VMHandles, bool, error) {
|
||||||
|
if vmDir == "" {
|
||||||
|
return model.VMHandles{}, false, nil
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(handlesFilePath(vmDir))
|
||||||
|
if os.IsNotExist(err) {
|
||||||
|
return model.VMHandles{}, false, nil
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return model.VMHandles{}, false, err
|
||||||
|
}
|
||||||
|
var h model.VMHandles
|
||||||
|
if err := json.Unmarshal(data, &h); err != nil {
|
||||||
|
return model.VMHandles{}, false, fmt.Errorf("parse handles.json: %w", err)
|
||||||
|
}
|
||||||
|
return h, true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func removeHandlesFile(vmDir string) {
|
||||||
|
if vmDir == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = os.Remove(handlesFilePath(vmDir))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ensureHandleCache lazily constructs the cache so direct
|
||||||
|
// `&Daemon{}` literals (common in tests) don't have to initialise
|
||||||
|
// it. Production code goes through Open(), which also builds it.
|
||||||
|
func (d *Daemon) ensureHandleCache() {
|
||||||
|
if d.handles == nil {
|
||||||
|
d.handles = newHandleCache()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// setVMHandlesInMemory is a test-only cache seed that skips the
|
||||||
|
// scratch-file write. Production callers should use setVMHandles so
|
||||||
|
// the filesystem survives a daemon restart.
|
||||||
|
func (d *Daemon) setVMHandlesInMemory(vmID string, h model.VMHandles) {
|
||||||
|
if d == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
d.ensureHandleCache()
|
||||||
|
d.handles.set(vmID, h)
|
||||||
|
}
|
||||||
|
|
||||||
|
// vmHandles returns the cached handles for vm (zero-value if no
|
||||||
|
// entry). Call sites that previously read `vm.Runtime.{PID,...}`
|
||||||
|
// should read through this instead.
|
||||||
|
func (d *Daemon) vmHandles(vmID string) model.VMHandles {
|
||||||
|
if d == nil {
|
||||||
|
return model.VMHandles{}
|
||||||
|
}
|
||||||
|
d.ensureHandleCache()
|
||||||
|
h, _ := d.handles.get(vmID)
|
||||||
|
return h
|
||||||
|
}
|
||||||
|
|
||||||
|
// setVMHandles updates the in-memory cache AND the per-VM scratch
|
||||||
|
// file. Scratch-file errors are logged but not returned; the cache
|
||||||
|
// write is authoritative while the daemon is alive.
|
||||||
|
func (d *Daemon) setVMHandles(vm model.VMRecord, h model.VMHandles) {
|
||||||
|
if d == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
d.ensureHandleCache()
|
||||||
|
d.handles.set(vm.ID, h)
|
||||||
|
if err := writeHandlesFile(vm.Runtime.VMDir, h); err != nil && d.logger != nil {
|
||||||
|
d.logger.Warn("persist handles.json failed", "vm_id", vm.ID, "error", err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// clearVMHandles drops the cache entry and removes the scratch
|
||||||
|
// file. Called on stop / delete / after a failed start.
|
||||||
|
func (d *Daemon) clearVMHandles(vm model.VMRecord) {
|
||||||
|
if d == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
d.ensureHandleCache()
|
||||||
|
d.handles.clear(vm.ID)
|
||||||
|
removeHandlesFile(vm.Runtime.VMDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
// vmAlive is the canonical "is this VM actually running?" check.
|
||||||
|
// Unlike the old `system.ProcessRunning(vm.Runtime.PID, apiSock)`
|
||||||
|
// pattern, this reads the PID from the handle cache — which is
|
||||||
|
// authoritative in-process — and verifies the PID against the api
|
||||||
|
// socket so a recycled PID can't false-positive.
|
||||||
|
func (d *Daemon) vmAlive(vm model.VMRecord) bool {
|
||||||
|
if vm.State != model.VMStateRunning {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
h := d.vmHandles(vm.ID)
|
||||||
|
if h.PID <= 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return system.ProcessRunning(h.PID, vm.Runtime.APISockPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
// rediscoverHandles loads what the last daemon start knew about a VM
|
||||||
|
// from its handles.json scratch file and verifies the firecracker
|
||||||
|
// process is still alive. Returns:
|
||||||
|
//
|
||||||
|
// - handles: the scratch-file contents (zero-value if no file).
|
||||||
|
// ALWAYS returned, even when alive=false, because the caller
|
||||||
|
// needs them to tear down kernel state (dm-snapshot, loops, tap)
|
||||||
|
// that the previous daemon left behind when it died.
|
||||||
|
// - alive: true iff a firecracker process matching the api sock is
|
||||||
|
// currently running.
|
||||||
|
// - err: unexpected failure (file exists but is corrupt).
|
||||||
|
//
|
||||||
|
// Strategy: pgrep by api sock path first (handles the case where
|
||||||
|
// the daemon crashed but the PID changed on respawn — unlikely for
|
||||||
|
// firecracker, but cheap insurance); fall back to verifying the
|
||||||
|
// scratch file's PID directly.
|
||||||
|
func (d *Daemon) rediscoverHandles(ctx context.Context, vm model.VMRecord) (model.VMHandles, bool, error) {
|
||||||
|
saved, _, err := readHandlesFile(vm.Runtime.VMDir)
|
||||||
|
if err != nil {
|
||||||
|
return model.VMHandles{}, false, err
|
||||||
|
}
|
||||||
|
apiSock := vm.Runtime.APISockPath
|
||||||
|
if apiSock == "" {
|
||||||
|
return saved, false, nil
|
||||||
|
}
|
||||||
|
if pid, pidErr := d.findFirecrackerPID(ctx, apiSock); pidErr == nil && pid > 0 {
|
||||||
|
saved.PID = pid
|
||||||
|
return saved, true, nil
|
||||||
|
}
|
||||||
|
if saved.PID > 0 && system.ProcessRunning(saved.PID, apiSock) {
|
||||||
|
return saved, true, nil
|
||||||
|
}
|
||||||
|
return saved, false, nil
|
||||||
|
}
|
||||||
197
internal/daemon/vm_handles_test.go
Normal file
197
internal/daemon/vm_handles_test.go
Normal file
|
|
@ -0,0 +1,197 @@
|
||||||
|
package daemon
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"banger/internal/model"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestHandlesFileRoundtrip(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
dir := t.TempDir()
|
||||||
|
want := model.VMHandles{
|
||||||
|
PID: 4242,
|
||||||
|
TapDevice: "tap-fc-abcd",
|
||||||
|
BaseLoop: "/dev/loop9",
|
||||||
|
COWLoop: "/dev/loop10",
|
||||||
|
DMName: "fc-rootfs-abcd",
|
||||||
|
DMDev: "/dev/mapper/fc-rootfs-abcd",
|
||||||
|
}
|
||||||
|
if err := writeHandlesFile(dir, want); err != nil {
|
||||||
|
t.Fatalf("writeHandlesFile: %v", err)
|
||||||
|
}
|
||||||
|
got, present, err := readHandlesFile(dir)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("readHandlesFile: %v", err)
|
||||||
|
}
|
||||||
|
if !present {
|
||||||
|
t.Fatal("readHandlesFile reported no file after write")
|
||||||
|
}
|
||||||
|
if got != want {
|
||||||
|
t.Fatalf("roundtrip mismatch:\n got %+v\n want %+v", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandlesFileMissingReturnsZero(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
h, present, err := readHandlesFile(t.TempDir())
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("readHandlesFile (missing): %v", err)
|
||||||
|
}
|
||||||
|
if present {
|
||||||
|
t.Fatal("present = true for missing file")
|
||||||
|
}
|
||||||
|
if !h.IsZero() {
|
||||||
|
t.Fatalf("expected zero-value handles, got %+v", h)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandlesFileCorruptReturnsError(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
dir := t.TempDir()
|
||||||
|
if err := os.WriteFile(handlesFilePath(dir), []byte("{not json"), 0o600); err != nil {
|
||||||
|
t.Fatalf("WriteFile: %v", err)
|
||||||
|
}
|
||||||
|
if _, _, err := readHandlesFile(dir); err == nil {
|
||||||
|
t.Fatal("expected parse error for corrupt file")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleCacheConcurrent(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
c := newHandleCache()
|
||||||
|
done := make(chan struct{})
|
||||||
|
// One writer, multiple readers — prove the RWMutex usage.
|
||||||
|
go func() {
|
||||||
|
for i := 0; i < 1000; i++ {
|
||||||
|
c.set("vm-1", model.VMHandles{PID: i})
|
||||||
|
}
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
for i := 0; i < 1000; i++ {
|
||||||
|
_, _ = c.get("vm-1")
|
||||||
|
}
|
||||||
|
<-done
|
||||||
|
c.clear("vm-1")
|
||||||
|
if _, ok := c.get("vm-1"); ok {
|
||||||
|
t.Fatal("cache entry still present after clear")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestRediscoverHandlesLoadsScratchWhenProcessDead proves the stale-
|
||||||
|
// cleanup path: the firecracker process is gone, but the scratch
|
||||||
|
// file tells us which kernel resources the previous daemon still
|
||||||
|
// owes us a teardown on.
|
||||||
|
func TestRediscoverHandlesLoadsScratchWhenProcessDead(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
vmDir := t.TempDir()
|
||||||
|
apiSock := filepath.Join(t.TempDir(), "fc.sock")
|
||||||
|
stale := model.VMHandles{
|
||||||
|
PID: 999999,
|
||||||
|
BaseLoop: "/dev/loop99",
|
||||||
|
COWLoop: "/dev/loop100",
|
||||||
|
DMName: "fc-rootfs-gone",
|
||||||
|
DMDev: "/dev/mapper/fc-rootfs-gone",
|
||||||
|
}
|
||||||
|
if err := writeHandlesFile(vmDir, stale); err != nil {
|
||||||
|
t.Fatalf("writeHandlesFile: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// A scripted runner that reports "no such process" when reconcile
|
||||||
|
// probes via pgrep.
|
||||||
|
runner := &scriptedRunner{
|
||||||
|
t: t,
|
||||||
|
steps: []runnerStep{
|
||||||
|
{call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, err: &exitErr{code: 1}},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
d := &Daemon{runner: runner}
|
||||||
|
vm := testVM("gone", "image-gone", "172.16.0.250")
|
||||||
|
vm.Runtime.APISockPath = apiSock
|
||||||
|
vm.Runtime.VMDir = vmDir
|
||||||
|
|
||||||
|
got, alive, err := d.rediscoverHandles(context.Background(), vm)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("rediscoverHandles: %v", err)
|
||||||
|
}
|
||||||
|
if alive {
|
||||||
|
t.Fatal("alive = true, want false (process dead)")
|
||||||
|
}
|
||||||
|
// Even when dead, the scratch handles must be returned so
|
||||||
|
// cleanupRuntime can tear DM + loops + tap down.
|
||||||
|
if got.DMName != stale.DMName || got.BaseLoop != stale.BaseLoop || got.COWLoop != stale.COWLoop {
|
||||||
|
t.Fatalf("stale handles lost: got %+v, want fields from %+v", got, stale)
|
||||||
|
}
|
||||||
|
runner.assertExhausted()
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestRediscoverHandlesPrefersLivePIDOverScratch: scratch file has an
|
||||||
|
// old PID, but pgrep finds the actual current PID via the api sock.
|
||||||
|
func TestRediscoverHandlesPrefersLivePIDOverScratch(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
vmDir := t.TempDir()
|
||||||
|
apiSock := filepath.Join(t.TempDir(), "fc.sock")
|
||||||
|
if err := writeHandlesFile(vmDir, model.VMHandles{PID: 111, DMName: "dm-x"}); err != nil {
|
||||||
|
t.Fatalf("writeHandlesFile: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
runner := &scriptedRunner{
|
||||||
|
t: t,
|
||||||
|
steps: []runnerStep{
|
||||||
|
{call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, out: []byte("222\n")},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
d := &Daemon{runner: runner}
|
||||||
|
vm := testVM("moved", "image-moved", "172.16.0.251")
|
||||||
|
vm.Runtime.APISockPath = apiSock
|
||||||
|
vm.Runtime.VMDir = vmDir
|
||||||
|
|
||||||
|
got, alive, err := d.rediscoverHandles(context.Background(), vm)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("rediscoverHandles: %v", err)
|
||||||
|
}
|
||||||
|
if !alive {
|
||||||
|
t.Fatal("alive = false, want true (pgrep found a PID)")
|
||||||
|
}
|
||||||
|
if got.PID != 222 {
|
||||||
|
t.Fatalf("PID = %d, want 222 (from pgrep, not scratch)", got.PID)
|
||||||
|
}
|
||||||
|
if got.DMName != "dm-x" {
|
||||||
|
t.Fatalf("scratch fields dropped: %+v", got)
|
||||||
|
}
|
||||||
|
runner.assertExhausted()
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestClearVMHandlesRemovesScratchFile proves the cleanup contract.
|
||||||
|
func TestClearVMHandlesRemovesScratchFile(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
vmDir := t.TempDir()
|
||||||
|
if err := writeHandlesFile(vmDir, model.VMHandles{PID: 42}); err != nil {
|
||||||
|
t.Fatalf("writeHandlesFile: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
d := &Daemon{}
|
||||||
|
vm := testVM("sweep", "image-sweep", "172.16.0.252")
|
||||||
|
vm.Runtime.VMDir = vmDir
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: 42})
|
||||||
|
d.clearVMHandles(vm)
|
||||||
|
|
||||||
|
if _, err := os.Stat(handlesFilePath(vmDir)); !os.IsNotExist(err) {
|
||||||
|
t.Fatalf("scratch file still present: %v", err)
|
||||||
|
}
|
||||||
|
if h, ok := d.handles.get(vm.ID); ok && !h.IsZero() {
|
||||||
|
t.Fatalf("cache entry survives clear: %+v", h)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// exitErr is a minimal stand-in for an exec-style non-zero exit.
|
||||||
|
// Used by scripted runners to simulate "pgrep found nothing".
|
||||||
|
type exitErr struct{ code int }
|
||||||
|
|
||||||
|
func (e *exitErr) Error() string { return "exit status " + strings.Repeat("1", 1) }
|
||||||
|
|
@ -22,7 +22,7 @@ func (d *Daemon) StartVM(ctx context.Context, idOrName string) (model.VMRecord,
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
}
|
}
|
||||||
if vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if d.vmAlive(vm) {
|
||||||
if d.logger != nil {
|
if d.logger != nil {
|
||||||
d.logger.Info("vm already running", vmLogAttrs(vm)...)
|
d.logger.Info("vm already running", vmLogAttrs(vm)...)
|
||||||
}
|
}
|
||||||
|
|
@ -54,7 +54,7 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
|
||||||
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
|
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
}
|
}
|
||||||
clearRuntimeHandles(&vm)
|
d.clearVMHandles(vm)
|
||||||
op.stage("bridge")
|
op.stage("bridge")
|
||||||
if err := d.ensureBridge(ctx); err != nil {
|
if err := d.ensureBridge(ctx); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
|
|
@ -92,14 +92,23 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
|
||||||
|
|
||||||
op.stage("dm_snapshot", "dm_name", dmName)
|
op.stage("dm_snapshot", "dm_name", dmName)
|
||||||
vmCreateStage(ctx, "prepare_rootfs", "creating root filesystem snapshot")
|
vmCreateStage(ctx, "prepare_rootfs", "creating root filesystem snapshot")
|
||||||
handles, err := d.createDMSnapshot(ctx, image.RootfsPath, vm.Runtime.SystemOverlay, dmName)
|
snapHandles, err := d.createDMSnapshot(ctx, image.RootfsPath, vm.Runtime.SystemOverlay, dmName)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
}
|
}
|
||||||
vm.Runtime.BaseLoop = handles.BaseLoop
|
// Live handles are threaded through this function as a local and
|
||||||
vm.Runtime.COWLoop = handles.COWLoop
|
// pushed to the cache via setVMHandles once we have every piece.
|
||||||
vm.Runtime.DMName = handles.DMName
|
// The cache update must happen BEFORE any step that reads handles
|
||||||
vm.Runtime.DMDev = handles.DMDev
|
// back (e.g. cleanupRuntime via cleanupOnErr) — otherwise loops
|
||||||
|
// and DM would leak on an early failure.
|
||||||
|
live := model.VMHandles{
|
||||||
|
BaseLoop: snapHandles.BaseLoop,
|
||||||
|
COWLoop: snapHandles.COWLoop,
|
||||||
|
DMName: snapHandles.DMName,
|
||||||
|
DMDev: snapHandles.DMDev,
|
||||||
|
}
|
||||||
|
d.setVMHandles(vm, live)
|
||||||
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
|
|
@ -113,7 +122,7 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
|
||||||
if cleanupErr := d.cleanupRuntime(context.Background(), vm, true); cleanupErr != nil {
|
if cleanupErr := d.cleanupRuntime(context.Background(), vm, true); cleanupErr != nil {
|
||||||
err = errors.Join(err, cleanupErr)
|
err = errors.Join(err, cleanupErr)
|
||||||
}
|
}
|
||||||
clearRuntimeHandles(&vm)
|
d.clearVMHandles(vm)
|
||||||
_ = d.store.UpsertVM(context.Background(), vm)
|
_ = d.store.UpsertVM(context.Background(), vm)
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
}
|
}
|
||||||
|
|
@ -133,7 +142,8 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return cleanupOnErr(err)
|
return cleanupOnErr(err)
|
||||||
}
|
}
|
||||||
vm.Runtime.TapDevice = tap
|
live.TapDevice = tap
|
||||||
|
d.setVMHandles(vm, live)
|
||||||
op.stage("metrics_file", "metrics_path", vm.Runtime.MetricsPath)
|
op.stage("metrics_file", "metrics_path", vm.Runtime.MetricsPath)
|
||||||
if err := os.WriteFile(vm.Runtime.MetricsPath, nil, 0o644); err != nil {
|
if err := os.WriteFile(vm.Runtime.MetricsPath, nil, 0o644); err != nil {
|
||||||
return cleanupOnErr(err)
|
return cleanupOnErr(err)
|
||||||
|
|
@ -170,7 +180,7 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
|
||||||
KernelArgs: kernelArgs,
|
KernelArgs: kernelArgs,
|
||||||
Drives: []firecracker.DriveConfig{{
|
Drives: []firecracker.DriveConfig{{
|
||||||
ID: "rootfs",
|
ID: "rootfs",
|
||||||
Path: vm.Runtime.DMDev,
|
Path: live.DMDev,
|
||||||
ReadOnly: false,
|
ReadOnly: false,
|
||||||
IsRoot: true,
|
IsRoot: true,
|
||||||
}},
|
}},
|
||||||
|
|
@ -190,11 +200,13 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
|
||||||
// Use a fresh context: the request ctx may already be cancelled (client
|
// Use a fresh context: the request ctx may already be cancelled (client
|
||||||
// disconnect), but we still need the PID so cleanupRuntime can kill the
|
// disconnect), but we still need the PID so cleanupRuntime can kill the
|
||||||
// Firecracker process that was spawned before the failure.
|
// Firecracker process that was spawned before the failure.
|
||||||
vm.Runtime.PID = d.resolveFirecrackerPID(context.Background(), machine, apiSock)
|
live.PID = d.resolveFirecrackerPID(context.Background(), machine, apiSock)
|
||||||
|
d.setVMHandles(vm, live)
|
||||||
return cleanupOnErr(err)
|
return cleanupOnErr(err)
|
||||||
}
|
}
|
||||||
vm.Runtime.PID = d.resolveFirecrackerPID(context.Background(), machine, apiSock)
|
live.PID = d.resolveFirecrackerPID(context.Background(), machine, apiSock)
|
||||||
op.debugStage("firecracker_started", "pid", vm.Runtime.PID)
|
d.setVMHandles(vm, live)
|
||||||
|
op.debugStage("firecracker_started", "pid", live.PID)
|
||||||
op.stage("socket_access", "api_socket", apiSock)
|
op.stage("socket_access", "api_socket", apiSock)
|
||||||
if err := d.ensureSocketAccess(ctx, apiSock, "firecracker api socket"); err != nil {
|
if err := d.ensureSocketAccess(ctx, apiSock, "firecracker api socket"); err != nil {
|
||||||
return cleanupOnErr(err)
|
return cleanupOnErr(err)
|
||||||
|
|
@ -237,29 +249,30 @@ func (d *Daemon) stopVMLocked(ctx context.Context, current model.VMRecord) (vm m
|
||||||
}
|
}
|
||||||
op.done(vmLogAttrs(vm)...)
|
op.done(vmLogAttrs(vm)...)
|
||||||
}()
|
}()
|
||||||
if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if !d.vmAlive(vm) {
|
||||||
op.stage("cleanup_stale_runtime")
|
op.stage("cleanup_stale_runtime")
|
||||||
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
|
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
}
|
}
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
clearRuntimeHandles(&vm)
|
d.clearVMHandles(vm)
|
||||||
if err := d.store.UpsertVM(ctx, vm); err != nil {
|
if err := d.store.UpsertVM(ctx, vm); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
}
|
}
|
||||||
return vm, nil
|
return vm, nil
|
||||||
}
|
}
|
||||||
|
pid := d.vmHandles(vm.ID).PID
|
||||||
op.stage("graceful_shutdown")
|
op.stage("graceful_shutdown")
|
||||||
if err := d.sendCtrlAltDel(ctx, vm); err != nil {
|
if err := d.sendCtrlAltDel(ctx, vm); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
}
|
}
|
||||||
op.stage("wait_for_exit", "pid", vm.Runtime.PID)
|
op.stage("wait_for_exit", "pid", pid)
|
||||||
if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil {
|
if err := d.waitForExit(ctx, pid, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil {
|
||||||
if !errors.Is(err, errWaitForExitTimeout) {
|
if !errors.Is(err, errWaitForExitTimeout) {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
}
|
}
|
||||||
op.stage("graceful_shutdown_timeout", "pid", vm.Runtime.PID)
|
op.stage("graceful_shutdown_timeout", "pid", pid)
|
||||||
}
|
}
|
||||||
op.stage("cleanup_runtime")
|
op.stage("cleanup_runtime")
|
||||||
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
|
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
|
||||||
|
|
@ -267,7 +280,7 @@ func (d *Daemon) stopVMLocked(ctx context.Context, current model.VMRecord) (vm m
|
||||||
}
|
}
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
clearRuntimeHandles(&vm)
|
d.clearVMHandles(vm)
|
||||||
system.TouchNow(&vm)
|
system.TouchNow(&vm)
|
||||||
if err := d.store.UpsertVM(ctx, vm); err != nil {
|
if err := d.store.UpsertVM(ctx, vm); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
|
|
@ -291,14 +304,14 @@ func (d *Daemon) killVMLocked(ctx context.Context, current model.VMRecord, signa
|
||||||
}
|
}
|
||||||
op.done(vmLogAttrs(vm)...)
|
op.done(vmLogAttrs(vm)...)
|
||||||
}()
|
}()
|
||||||
if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if !d.vmAlive(vm) {
|
||||||
op.stage("cleanup_stale_runtime")
|
op.stage("cleanup_stale_runtime")
|
||||||
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
|
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
}
|
}
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
clearRuntimeHandles(&vm)
|
d.clearVMHandles(vm)
|
||||||
if err := d.store.UpsertVM(ctx, vm); err != nil {
|
if err := d.store.UpsertVM(ctx, vm); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
}
|
}
|
||||||
|
|
@ -309,16 +322,17 @@ func (d *Daemon) killVMLocked(ctx context.Context, current model.VMRecord, signa
|
||||||
if signal == "" {
|
if signal == "" {
|
||||||
signal = "TERM"
|
signal = "TERM"
|
||||||
}
|
}
|
||||||
op.stage("send_signal", "pid", vm.Runtime.PID, "signal", signal)
|
pid := d.vmHandles(vm.ID).PID
|
||||||
if _, err := d.runner.RunSudo(ctx, "kill", "-"+signal, strconv.Itoa(vm.Runtime.PID)); err != nil {
|
op.stage("send_signal", "pid", pid, "signal", signal)
|
||||||
|
if _, err := d.runner.RunSudo(ctx, "kill", "-"+signal, strconv.Itoa(pid)); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
}
|
}
|
||||||
op.stage("wait_for_exit", "pid", vm.Runtime.PID)
|
op.stage("wait_for_exit", "pid", pid)
|
||||||
if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
|
if err := d.waitForExit(ctx, pid, vm.Runtime.APISockPath, 30*time.Second); err != nil {
|
||||||
if !errors.Is(err, errWaitForExitTimeout) {
|
if !errors.Is(err, errWaitForExitTimeout) {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
}
|
}
|
||||||
op.stage("signal_timeout", "pid", vm.Runtime.PID, "signal", signal)
|
op.stage("signal_timeout", "pid", pid, "signal", signal)
|
||||||
}
|
}
|
||||||
op.stage("cleanup_runtime")
|
op.stage("cleanup_runtime")
|
||||||
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
|
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
|
||||||
|
|
@ -326,7 +340,7 @@ func (d *Daemon) killVMLocked(ctx context.Context, current model.VMRecord, signa
|
||||||
}
|
}
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
clearRuntimeHandles(&vm)
|
d.clearVMHandles(vm)
|
||||||
system.TouchNow(&vm)
|
system.TouchNow(&vm)
|
||||||
if err := d.store.UpsertVM(ctx, vm); err != nil {
|
if err := d.store.UpsertVM(ctx, vm); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
|
|
@ -378,9 +392,10 @@ func (d *Daemon) deleteVMLocked(ctx context.Context, current model.VMRecord) (vm
|
||||||
}
|
}
|
||||||
op.done(vmLogAttrs(vm)...)
|
op.done(vmLogAttrs(vm)...)
|
||||||
}()
|
}()
|
||||||
if vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if d.vmAlive(vm) {
|
||||||
op.stage("kill_running_vm", "pid", vm.Runtime.PID)
|
pid := d.vmHandles(vm.ID).PID
|
||||||
_ = d.killVMProcess(ctx, vm.Runtime.PID)
|
op.stage("kill_running_vm", "pid", pid)
|
||||||
|
_ = d.killVMProcess(ctx, pid)
|
||||||
}
|
}
|
||||||
op.stage("cleanup_runtime")
|
op.stage("cleanup_runtime")
|
||||||
if err := d.cleanupRuntime(ctx, vm, false); err != nil {
|
if err := d.cleanupRuntime(ctx, vm, false); err != nil {
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,7 @@ func (d *Daemon) setVMLocked(ctx context.Context, current model.VMRecord, params
|
||||||
}
|
}
|
||||||
op.done(vmLogAttrs(vm)...)
|
op.done(vmLogAttrs(vm)...)
|
||||||
}()
|
}()
|
||||||
running := vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath)
|
running := d.vmAlive(vm)
|
||||||
if params.VCPUCount != nil {
|
if params.VCPUCount != nil {
|
||||||
if err := validateOptionalPositiveSetting("vcpu", params.VCPUCount); err != nil {
|
if err := validateOptionalPositiveSetting("vcpu", params.VCPUCount); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,7 @@ func (d *Daemon) GetVMStats(ctx context.Context, idOrName string) (model.VMRecor
|
||||||
func (d *Daemon) HealthVM(ctx context.Context, idOrName string) (result api.VMHealthResult, err error) {
|
func (d *Daemon) HealthVM(ctx context.Context, idOrName string) (result api.VMHealthResult, err error) {
|
||||||
_, err = d.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
|
_, err = d.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
|
||||||
result.Name = vm.Name
|
result.Name = vm.Name
|
||||||
if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if !d.vmAlive(vm) {
|
||||||
result.Healthy = false
|
result.Healthy = false
|
||||||
return vm, nil
|
return vm, nil
|
||||||
}
|
}
|
||||||
|
|
@ -77,7 +77,7 @@ func (d *Daemon) pollStats(ctx context.Context) error {
|
||||||
}
|
}
|
||||||
for _, vm := range vms {
|
for _, vm := range vms {
|
||||||
if err := d.withVMLockByIDErr(ctx, vm.ID, func(vm model.VMRecord) error {
|
if err := d.withVMLockByIDErr(ctx, vm.ID, func(vm model.VMRecord) error {
|
||||||
if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if !d.vmAlive(vm) {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
stats, err := d.collectStats(ctx, vm)
|
stats, err := d.collectStats(ctx, vm)
|
||||||
|
|
@ -116,7 +116,7 @@ func (d *Daemon) stopStaleVMs(ctx context.Context) (err error) {
|
||||||
now := model.Now()
|
now := model.Now()
|
||||||
for _, vm := range vms {
|
for _, vm := range vms {
|
||||||
if err := d.withVMLockByIDErr(ctx, vm.ID, func(vm model.VMRecord) error {
|
if err := d.withVMLockByIDErr(ctx, vm.ID, func(vm model.VMRecord) error {
|
||||||
if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if !d.vmAlive(vm) {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
if now.Sub(vm.LastTouchedAt) < d.config.AutoStopStaleAfter {
|
if now.Sub(vm.LastTouchedAt) < d.config.AutoStopStaleAfter {
|
||||||
|
|
@ -124,11 +124,11 @@ func (d *Daemon) stopStaleVMs(ctx context.Context) (err error) {
|
||||||
}
|
}
|
||||||
op.stage("stopping_vm", vmLogAttrs(vm)...)
|
op.stage("stopping_vm", vmLogAttrs(vm)...)
|
||||||
_ = d.sendCtrlAltDel(ctx, vm)
|
_ = d.sendCtrlAltDel(ctx, vm)
|
||||||
_ = d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 10*time.Second)
|
_ = d.waitForExit(ctx, d.vmHandles(vm.ID).PID, vm.Runtime.APISockPath, 10*time.Second)
|
||||||
_ = d.cleanupRuntime(ctx, vm, true)
|
_ = d.cleanupRuntime(ctx, vm, true)
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
clearRuntimeHandles(&vm)
|
d.clearVMHandles(vm)
|
||||||
vm.UpdatedAt = model.Now()
|
vm.UpdatedAt = model.Now()
|
||||||
return d.store.UpsertVM(ctx, vm)
|
return d.store.UpsertVM(ctx, vm)
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
|
|
@ -145,9 +145,8 @@ func (d *Daemon) collectStats(ctx context.Context, vm model.VMRecord) (model.VMS
|
||||||
WorkDiskBytes: system.AllocatedBytes(vm.Runtime.WorkDiskPath),
|
WorkDiskBytes: system.AllocatedBytes(vm.Runtime.WorkDiskPath),
|
||||||
MetricsRaw: system.ParseMetricsFile(vm.Runtime.MetricsPath),
|
MetricsRaw: system.ParseMetricsFile(vm.Runtime.MetricsPath),
|
||||||
}
|
}
|
||||||
if vm.Runtime.PID > 0 && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if d.vmAlive(vm) {
|
||||||
ps, err := system.ReadProcessStats(ctx, vm.Runtime.PID)
|
if ps, err := system.ReadProcessStats(ctx, d.vmHandles(vm.ID).PID); err == nil {
|
||||||
if err == nil {
|
|
||||||
stats.CPUPercent = ps.CPUPercent
|
stats.CPUPercent = ps.CPUPercent
|
||||||
stats.RSSBytes = ps.RSSBytes
|
stats.RSSBytes = ps.RSSBytes
|
||||||
stats.VSZBytes = ps.VSZBytes
|
stats.VSZBytes = ps.VSZBytes
|
||||||
|
|
|
||||||
|
|
@ -112,21 +112,36 @@ func TestReconcileStopsStaleRunningVMAndClearsRuntimeHandles(t *testing.T) {
|
||||||
if err := os.WriteFile(apiSock, []byte{}, 0o644); err != nil {
|
if err := os.WriteFile(apiSock, []byte{}, 0o644); err != nil {
|
||||||
t.Fatalf("WriteFile(api sock): %v", err)
|
t.Fatalf("WriteFile(api sock): %v", err)
|
||||||
}
|
}
|
||||||
|
vmDir := t.TempDir()
|
||||||
vm := testVM("stale", "image-stale", "172.16.0.9")
|
vm := testVM("stale", "image-stale", "172.16.0.9")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = 999999
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
vm.Runtime.DMName = "fc-rootfs-stale"
|
vm.Runtime.VMDir = vmDir
|
||||||
vm.Runtime.DMDev = "/dev/mapper/fc-rootfs-stale"
|
|
||||||
vm.Runtime.COWLoop = "/dev/loop11"
|
|
||||||
vm.Runtime.BaseLoop = "/dev/loop10"
|
|
||||||
vm.Runtime.DNSName = ""
|
vm.Runtime.DNSName = ""
|
||||||
upsertDaemonVM(t, ctx, db, vm)
|
upsertDaemonVM(t, ctx, db, vm)
|
||||||
|
|
||||||
|
// Simulate the prior daemon crashing while this VM was running:
|
||||||
|
// the handles.json scratch file survives and names a stale PID +
|
||||||
|
// DM snapshot. Reconcile should discover the PID is gone, tear
|
||||||
|
// the kernel state down via the runner, and clear the scratch.
|
||||||
|
stale := model.VMHandles{
|
||||||
|
PID: 999999,
|
||||||
|
BaseLoop: "/dev/loop10",
|
||||||
|
COWLoop: "/dev/loop11",
|
||||||
|
DMName: "fc-rootfs-stale",
|
||||||
|
DMDev: "/dev/mapper/fc-rootfs-stale",
|
||||||
|
}
|
||||||
|
if err := writeHandlesFile(vmDir, stale); err != nil {
|
||||||
|
t.Fatalf("writeHandlesFile: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
runner := &scriptedRunner{
|
runner := &scriptedRunner{
|
||||||
t: t,
|
t: t,
|
||||||
steps: []runnerStep{
|
steps: []runnerStep{
|
||||||
|
// First pgrep: rediscoverHandles tries to verify the PID.
|
||||||
|
{call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, err: errors.New("exit status 1")},
|
||||||
|
// Second pgrep: cleanupRuntime asks again before killing.
|
||||||
{call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, err: errors.New("exit status 1")},
|
{call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, err: errors.New("exit status 1")},
|
||||||
sudoStep("", nil, "dmsetup", "remove", "fc-rootfs-stale"),
|
sudoStep("", nil, "dmsetup", "remove", "fc-rootfs-stale"),
|
||||||
sudoStep("", nil, "losetup", "-d", "/dev/loop11"),
|
sudoStep("", nil, "losetup", "-d", "/dev/loop11"),
|
||||||
|
|
@ -147,8 +162,13 @@ func TestReconcileStopsStaleRunningVMAndClearsRuntimeHandles(t *testing.T) {
|
||||||
if got.State != model.VMStateStopped || got.Runtime.State != model.VMStateStopped {
|
if got.State != model.VMStateStopped || got.Runtime.State != model.VMStateStopped {
|
||||||
t.Fatalf("vm state after reconcile = %s/%s, want stopped", got.State, got.Runtime.State)
|
t.Fatalf("vm state after reconcile = %s/%s, want stopped", got.State, got.Runtime.State)
|
||||||
}
|
}
|
||||||
if got.Runtime.PID != 0 || got.Runtime.APISockPath != "" || got.Runtime.DMName != "" || got.Runtime.COWLoop != "" || got.Runtime.BaseLoop != "" {
|
// The scratch file must be gone — stopped VMs don't carry handles.
|
||||||
t.Fatalf("runtime handles not cleared after reconcile: %+v", got.Runtime)
|
if _, err := os.Stat(handlesFilePath(vmDir)); !os.IsNotExist(err) {
|
||||||
|
t.Fatalf("handles.json still present after reconcile: %v", err)
|
||||||
|
}
|
||||||
|
// And the in-memory cache must be empty.
|
||||||
|
if h, ok := d.handles.get(vm.ID); ok && !h.IsZero() {
|
||||||
|
t.Fatalf("handle cache not cleared after reconcile: %+v", h)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -168,13 +188,11 @@ func TestRebuildDNSIncludesOnlyLiveRunningVMs(t *testing.T) {
|
||||||
live := testVM("live", "image-live", "172.16.0.21")
|
live := testVM("live", "image-live", "172.16.0.21")
|
||||||
live.State = model.VMStateRunning
|
live.State = model.VMStateRunning
|
||||||
live.Runtime.State = model.VMStateRunning
|
live.Runtime.State = model.VMStateRunning
|
||||||
live.Runtime.PID = liveCmd.Process.Pid
|
|
||||||
live.Runtime.APISockPath = liveSock
|
live.Runtime.APISockPath = liveSock
|
||||||
|
|
||||||
stale := testVM("stale", "image-stale", "172.16.0.22")
|
stale := testVM("stale", "image-stale", "172.16.0.22")
|
||||||
stale.State = model.VMStateRunning
|
stale.State = model.VMStateRunning
|
||||||
stale.Runtime.State = model.VMStateRunning
|
stale.Runtime.State = model.VMStateRunning
|
||||||
stale.Runtime.PID = 999999
|
|
||||||
stale.Runtime.APISockPath = filepath.Join(t.TempDir(), "stale.sock")
|
stale.Runtime.APISockPath = filepath.Join(t.TempDir(), "stale.sock")
|
||||||
|
|
||||||
stopped := testVM("stopped", "image-stopped", "172.16.0.23")
|
stopped := testVM("stopped", "image-stopped", "172.16.0.23")
|
||||||
|
|
@ -195,6 +213,11 @@ func TestRebuildDNSIncludesOnlyLiveRunningVMs(t *testing.T) {
|
||||||
})
|
})
|
||||||
|
|
||||||
d := &Daemon{store: db, vmDNS: server}
|
d := &Daemon{store: db, vmDNS: server}
|
||||||
|
// rebuildDNS reads the alive check from the handle cache. Seed
|
||||||
|
// the live VM with its real PID; leave the stale entry with a PID
|
||||||
|
// that definitely isn't running (999999 ≫ max PID on most hosts).
|
||||||
|
d.setVMHandlesInMemory(live.ID, model.VMHandles{PID: liveCmd.Process.Pid})
|
||||||
|
d.setVMHandlesInMemory(stale.ID, model.VMHandles{PID: 999999})
|
||||||
if err := d.rebuildDNS(ctx); err != nil {
|
if err := d.rebuildDNS(ctx); err != nil {
|
||||||
t.Fatalf("rebuildDNS: %v", err)
|
t.Fatalf("rebuildDNS: %v", err)
|
||||||
}
|
}
|
||||||
|
|
@ -225,11 +248,11 @@ func TestSetVMRejectsStoppedOnlyChangesForRunningVM(t *testing.T) {
|
||||||
vm := testVM("running", "image-run", "172.16.0.10")
|
vm := testVM("running", "image-run", "172.16.0.10")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = cmd.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
upsertDaemonVM(t, ctx, db, vm)
|
upsertDaemonVM(t, ctx, db, vm)
|
||||||
|
|
||||||
d := &Daemon{store: db}
|
d := &Daemon{store: db}
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: cmd.Process.Pid})
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
params api.VMSetParams
|
params api.VMSetParams
|
||||||
|
|
@ -330,12 +353,12 @@ func TestHealthVMReturnsHealthyForRunningGuest(t *testing.T) {
|
||||||
vm := testVM("alive", "image-alive", "172.16.0.41")
|
vm := testVM("alive", "image-alive", "172.16.0.41")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = fake.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
vm.Runtime.VSockPath = vsockSock
|
vm.Runtime.VSockPath = vsockSock
|
||||||
vm.Runtime.VSockCID = 10041
|
vm.Runtime.VSockCID = 10041
|
||||||
upsertDaemonVM(t, ctx, db, vm)
|
upsertDaemonVM(t, ctx, db, vm)
|
||||||
|
|
||||||
|
handlePID := fake.Process.Pid
|
||||||
runner := &scriptedRunner{
|
runner := &scriptedRunner{
|
||||||
t: t,
|
t: t,
|
||||||
steps: []runnerStep{
|
steps: []runnerStep{
|
||||||
|
|
@ -344,6 +367,7 @@ func TestHealthVMReturnsHealthyForRunningGuest(t *testing.T) {
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
d := &Daemon{store: db, runner: runner}
|
d := &Daemon{store: db, runner: runner}
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: handlePID})
|
||||||
result, err := d.HealthVM(ctx, vm.Name)
|
result, err := d.HealthVM(ctx, vm.Name)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("HealthVM: %v", err)
|
t.Fatalf("HealthVM: %v", err)
|
||||||
|
|
@ -393,7 +417,6 @@ func TestPingVMAliasReturnsAliveForHealthyVM(t *testing.T) {
|
||||||
vm := testVM("healthy-ping", "image-healthy", "172.16.0.42")
|
vm := testVM("healthy-ping", "image-healthy", "172.16.0.42")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = fake.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
vm.Runtime.VSockPath = vsockSock
|
vm.Runtime.VSockPath = vsockSock
|
||||||
vm.Runtime.VSockCID = 10042
|
vm.Runtime.VSockCID = 10042
|
||||||
|
|
@ -407,6 +430,7 @@ func TestPingVMAliasReturnsAliveForHealthyVM(t *testing.T) {
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
d := &Daemon{store: db, runner: runner}
|
d := &Daemon{store: db, runner: runner}
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: fake.Process.Pid})
|
||||||
result, err := d.PingVM(ctx, vm.Name)
|
result, err := d.PingVM(ctx, vm.Name)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("PingVM: %v", err)
|
t.Fatalf("PingVM: %v", err)
|
||||||
|
|
@ -590,7 +614,6 @@ func TestPortsVMReturnsEnrichedPortsAndWebSchemes(t *testing.T) {
|
||||||
vm := testVM("ports", "image-ports", "127.0.0.1")
|
vm := testVM("ports", "image-ports", "127.0.0.1")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = fake.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
vm.Runtime.VSockPath = vsockSock
|
vm.Runtime.VSockPath = vsockSock
|
||||||
vm.Runtime.VSockCID = 10043
|
vm.Runtime.VSockCID = 10043
|
||||||
|
|
@ -604,6 +627,7 @@ func TestPortsVMReturnsEnrichedPortsAndWebSchemes(t *testing.T) {
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
d := &Daemon{store: db, runner: runner}
|
d := &Daemon{store: db, runner: runner}
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: fake.Process.Pid})
|
||||||
|
|
||||||
result, err := d.PortsVM(ctx, vm.Name)
|
result, err := d.PortsVM(ctx, vm.Name)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
@ -1341,8 +1365,10 @@ func TestCleanupRuntimeRediscoversLiveFirecrackerPID(t *testing.T) {
|
||||||
}
|
}
|
||||||
d := &Daemon{runner: runner}
|
d := &Daemon{runner: runner}
|
||||||
vm := testVM("cleanup", "image-cleanup", "172.16.0.22")
|
vm := testVM("cleanup", "image-cleanup", "172.16.0.22")
|
||||||
vm.Runtime.PID = fake.Process.Pid + 999
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
|
// Seed a stale PID so cleanupRuntime's findFirecrackerPID pgrep
|
||||||
|
// fallback wins — it rediscovers fake.Process.Pid from apiSock.
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: fake.Process.Pid + 999})
|
||||||
|
|
||||||
if err := d.cleanupRuntime(context.Background(), vm, true); err != nil {
|
if err := d.cleanupRuntime(context.Background(), vm, true); err != nil {
|
||||||
t.Fatalf("cleanupRuntime returned error: %v", err)
|
t.Fatalf("cleanupRuntime returned error: %v", err)
|
||||||
|
|
@ -1366,7 +1392,6 @@ func TestDeleteStoppedNATVMDoesNotFailWithoutTapDevice(t *testing.T) {
|
||||||
vm := testVM("stopped-nat", "image-stopped-nat", "172.16.0.24")
|
vm := testVM("stopped-nat", "image-stopped-nat", "172.16.0.24")
|
||||||
vm.Spec.NATEnabled = true
|
vm.Spec.NATEnabled = true
|
||||||
vm.Runtime.VMDir = vmDir
|
vm.Runtime.VMDir = vmDir
|
||||||
vm.Runtime.TapDevice = ""
|
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
upsertDaemonVM(t, ctx, db, vm)
|
upsertDaemonVM(t, ctx, db, vm)
|
||||||
|
|
@ -1410,7 +1435,6 @@ func TestStopVMFallsBackToForcedCleanupAfterGracefulTimeout(t *testing.T) {
|
||||||
vm := testVM("stubborn", "image-stubborn", "172.16.0.23")
|
vm := testVM("stubborn", "image-stubborn", "172.16.0.23")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = fake.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
upsertDaemonVM(t, ctx, db, vm)
|
upsertDaemonVM(t, ctx, db, vm)
|
||||||
|
|
||||||
|
|
@ -1427,6 +1451,7 @@ func TestStopVMFallsBackToForcedCleanupAfterGracefulTimeout(t *testing.T) {
|
||||||
proc: fake,
|
proc: fake,
|
||||||
}
|
}
|
||||||
d := &Daemon{store: db, runner: runner}
|
d := &Daemon{store: db, runner: runner}
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: fake.Process.Pid})
|
||||||
|
|
||||||
got, err := d.StopVM(ctx, vm.ID)
|
got, err := d.StopVM(ctx, vm.ID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
@ -1436,8 +1461,11 @@ func TestStopVMFallsBackToForcedCleanupAfterGracefulTimeout(t *testing.T) {
|
||||||
if got.State != model.VMStateStopped || got.Runtime.State != model.VMStateStopped {
|
if got.State != model.VMStateStopped || got.Runtime.State != model.VMStateStopped {
|
||||||
t.Fatalf("StopVM state = %s/%s, want stopped", got.State, got.Runtime.State)
|
t.Fatalf("StopVM state = %s/%s, want stopped", got.State, got.Runtime.State)
|
||||||
}
|
}
|
||||||
if got.Runtime.PID != 0 || got.Runtime.APISockPath != "" {
|
// APISockPath + VSock paths are deterministic — they stay on the
|
||||||
t.Fatalf("runtime handles not cleared: %+v", got.Runtime)
|
// record for debugging and next-start reuse even after stop. The
|
||||||
|
// post-stop invariant is that the in-memory cache is empty.
|
||||||
|
if h, ok := d.handles.get(vm.ID); ok && !h.IsZero() {
|
||||||
|
t.Fatalf("handle cache not cleared: %+v", h)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,6 @@ import (
|
||||||
sess "banger/internal/daemon/session"
|
sess "banger/internal/daemon/session"
|
||||||
ws "banger/internal/daemon/workspace"
|
ws "banger/internal/daemon/workspace"
|
||||||
"banger/internal/model"
|
"banger/internal/model"
|
||||||
"banger/internal/system"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// Test seams. Tests swap these to observe or stall the guest-I/O
|
// Test seams. Tests swap these to observe or stall the guest-I/O
|
||||||
|
|
@ -33,7 +32,7 @@ func (d *Daemon) ExportVMWorkspace(ctx context.Context, params api.WorkspaceExpo
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return api.WorkspaceExportResult{}, err
|
return api.WorkspaceExportResult{}, err
|
||||||
}
|
}
|
||||||
if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if !d.vmAlive(vm) {
|
||||||
return api.WorkspaceExportResult{}, fmt.Errorf("vm %q is not running", vm.Name)
|
return api.WorkspaceExportResult{}, fmt.Errorf("vm %q is not running", vm.Name)
|
||||||
}
|
}
|
||||||
// Serialise with any in-flight workspace.prepare on the same VM so
|
// Serialise with any in-flight workspace.prepare on the same VM so
|
||||||
|
|
@ -133,7 +132,7 @@ func (d *Daemon) PrepareVMWorkspace(ctx context.Context, params api.VMWorkspaceP
|
||||||
// before any SSH or tar I/O so this slow operation cannot block
|
// before any SSH or tar I/O so this slow operation cannot block
|
||||||
// vm stop / vm delete / vm restart on the same VM.
|
// vm stop / vm delete / vm restart on the same VM.
|
||||||
vm, err := d.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
|
vm, err := d.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
|
||||||
if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
if !d.vmAlive(vm) {
|
||||||
return model.VMRecord{}, fmt.Errorf("vm %q is not running", vm.Name)
|
return model.VMRecord{}, fmt.Errorf("vm %q is not running", vm.Name)
|
||||||
}
|
}
|
||||||
return vm, nil
|
return vm, nil
|
||||||
|
|
|
||||||
|
|
@ -81,7 +81,6 @@ func TestExportVMWorkspace_HappyPath(t *testing.T) {
|
||||||
vm := testVM("exportbox", "image-export", "172.16.0.100")
|
vm := testVM("exportbox", "image-export", "172.16.0.100")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = firecracker.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
|
|
||||||
patch := []byte("diff --git a/file.go b/file.go\nindex 0000000..1111111 100644\n")
|
patch := []byte("diff --git a/file.go b/file.go\nindex 0000000..1111111 100644\n")
|
||||||
|
|
@ -95,6 +94,7 @@ func TestExportVMWorkspace_HappyPath(t *testing.T) {
|
||||||
}
|
}
|
||||||
d := newExportTestDaemonStore(t, fake)
|
d := newExportTestDaemonStore(t, fake)
|
||||||
upsertDaemonVM(t, ctx, d.store, vm)
|
upsertDaemonVM(t, ctx, d.store, vm)
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
|
||||||
|
|
||||||
result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
|
result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
|
||||||
IDOrName: vm.Name,
|
IDOrName: vm.Name,
|
||||||
|
|
@ -139,7 +139,6 @@ func TestExportVMWorkspace_WithBaseCommit(t *testing.T) {
|
||||||
vm := testVM("exportbox-base", "image-export", "172.16.0.105")
|
vm := testVM("exportbox-base", "image-export", "172.16.0.105")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = firecracker.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
|
|
||||||
// Simulate: worker committed inside the VM. Without base_commit the diff
|
// Simulate: worker committed inside the VM. Without base_commit the diff
|
||||||
|
|
@ -156,6 +155,7 @@ func TestExportVMWorkspace_WithBaseCommit(t *testing.T) {
|
||||||
}
|
}
|
||||||
d := newExportTestDaemonStore(t, fake)
|
d := newExportTestDaemonStore(t, fake)
|
||||||
upsertDaemonVM(t, ctx, d.store, vm)
|
upsertDaemonVM(t, ctx, d.store, vm)
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
|
||||||
|
|
||||||
const prepareCommit = "abc1234deadbeef"
|
const prepareCommit = "abc1234deadbeef"
|
||||||
result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
|
result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
|
||||||
|
|
@ -192,7 +192,6 @@ func TestExportVMWorkspace_BaseCommitFallsBackToHEAD(t *testing.T) {
|
||||||
vm := testVM("exportbox-nobase", "image-export", "172.16.0.106")
|
vm := testVM("exportbox-nobase", "image-export", "172.16.0.106")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = firecracker.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
|
|
||||||
fake := &exportGuestClient{
|
fake := &exportGuestClient{
|
||||||
|
|
@ -203,6 +202,7 @@ func TestExportVMWorkspace_BaseCommitFallsBackToHEAD(t *testing.T) {
|
||||||
}
|
}
|
||||||
d := newExportTestDaemonStore(t, fake)
|
d := newExportTestDaemonStore(t, fake)
|
||||||
upsertDaemonVM(t, ctx, d.store, vm)
|
upsertDaemonVM(t, ctx, d.store, vm)
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
|
||||||
|
|
||||||
result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
|
result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
|
||||||
IDOrName: vm.Name,
|
IDOrName: vm.Name,
|
||||||
|
|
@ -231,7 +231,6 @@ func TestExportVMWorkspace_NoChanges(t *testing.T) {
|
||||||
vm := testVM("exportbox-empty", "image-export", "172.16.0.101")
|
vm := testVM("exportbox-empty", "image-export", "172.16.0.101")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = firecracker.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
|
|
||||||
// Both scripts return empty output (no changes).
|
// Both scripts return empty output (no changes).
|
||||||
|
|
@ -243,6 +242,7 @@ func TestExportVMWorkspace_NoChanges(t *testing.T) {
|
||||||
}
|
}
|
||||||
d := newExportTestDaemonStore(t, fake)
|
d := newExportTestDaemonStore(t, fake)
|
||||||
upsertDaemonVM(t, ctx, d.store, vm)
|
upsertDaemonVM(t, ctx, d.store, vm)
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
|
||||||
|
|
||||||
result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
|
result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
|
||||||
IDOrName: vm.Name,
|
IDOrName: vm.Name,
|
||||||
|
|
@ -271,7 +271,6 @@ func TestExportVMWorkspace_DefaultGuestPath(t *testing.T) {
|
||||||
vm := testVM("exportbox-default", "image-export", "172.16.0.102")
|
vm := testVM("exportbox-default", "image-export", "172.16.0.102")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = firecracker.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
|
|
||||||
fake := &exportGuestClient{
|
fake := &exportGuestClient{
|
||||||
|
|
@ -282,6 +281,7 @@ func TestExportVMWorkspace_DefaultGuestPath(t *testing.T) {
|
||||||
}
|
}
|
||||||
d := newExportTestDaemonStore(t, fake)
|
d := newExportTestDaemonStore(t, fake)
|
||||||
upsertDaemonVM(t, ctx, d.store, vm)
|
upsertDaemonVM(t, ctx, d.store, vm)
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
|
||||||
|
|
||||||
// GuestPath omitted — should default to /root/repo.
|
// GuestPath omitted — should default to /root/repo.
|
||||||
result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
|
result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
|
||||||
|
|
@ -305,6 +305,7 @@ func TestExportVMWorkspace_VMNotRunning(t *testing.T) {
|
||||||
fake := &exportGuestClient{}
|
fake := &exportGuestClient{}
|
||||||
d := newExportTestDaemonStore(t, fake)
|
d := newExportTestDaemonStore(t, fake)
|
||||||
upsertDaemonVM(t, ctx, d.store, vm)
|
upsertDaemonVM(t, ctx, d.store, vm)
|
||||||
|
// VM is stopped — no handle seed; vmAlive must return false.
|
||||||
|
|
||||||
_, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
|
_, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
|
||||||
IDOrName: vm.Name,
|
IDOrName: vm.Name,
|
||||||
|
|
@ -327,7 +328,6 @@ func TestExportVMWorkspace_MultipleChangedFiles(t *testing.T) {
|
||||||
vm := testVM("exportbox-multi", "image-export", "172.16.0.104")
|
vm := testVM("exportbox-multi", "image-export", "172.16.0.104")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = firecracker.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
|
|
||||||
patch := []byte("diff --git a/a.go b/a.go\n--- a/a.go\n+++ b/a.go\n")
|
patch := []byte("diff --git a/a.go b/a.go\n--- a/a.go\n+++ b/a.go\n")
|
||||||
|
|
@ -341,6 +341,7 @@ func TestExportVMWorkspace_MultipleChangedFiles(t *testing.T) {
|
||||||
}
|
}
|
||||||
d := newExportTestDaemonStore(t, fake)
|
d := newExportTestDaemonStore(t, fake)
|
||||||
upsertDaemonVM(t, ctx, d.store, vm)
|
upsertDaemonVM(t, ctx, d.store, vm)
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
|
||||||
|
|
||||||
result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
|
result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
|
||||||
IDOrName: vm.Name,
|
IDOrName: vm.Name,
|
||||||
|
|
@ -380,7 +381,6 @@ func TestPrepareVMWorkspace_ReleasesVMLockDuringGuestIO(t *testing.T) {
|
||||||
vm := testVM("lockbox", "image-x", "172.16.0.210")
|
vm := testVM("lockbox", "image-x", "172.16.0.210")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = firecracker.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
|
|
||||||
d := &Daemon{
|
d := &Daemon{
|
||||||
|
|
@ -393,6 +393,7 @@ func TestPrepareVMWorkspace_ReleasesVMLockDuringGuestIO(t *testing.T) {
|
||||||
return &exportGuestClient{}, nil
|
return &exportGuestClient{}, nil
|
||||||
}
|
}
|
||||||
upsertDaemonVM(t, ctx, d.store, vm)
|
upsertDaemonVM(t, ctx, d.store, vm)
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
|
||||||
|
|
||||||
// Replace the seams. InspectRepo returns a trivial spec so the
|
// Replace the seams. InspectRepo returns a trivial spec so the
|
||||||
// real filesystem isn't touched; Import blocks until we say go.
|
// real filesystem isn't touched; Import blocks until we say go.
|
||||||
|
|
@ -473,7 +474,6 @@ func TestPrepareVMWorkspace_SerialisesConcurrentPreparesOnSameVM(t *testing.T) {
|
||||||
vm := testVM("serialbox", "image-x", "172.16.0.211")
|
vm := testVM("serialbox", "image-x", "172.16.0.211")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = firecracker.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
|
|
||||||
d := &Daemon{
|
d := &Daemon{
|
||||||
|
|
@ -486,6 +486,7 @@ func TestPrepareVMWorkspace_SerialisesConcurrentPreparesOnSameVM(t *testing.T) {
|
||||||
return &exportGuestClient{}, nil
|
return &exportGuestClient{}, nil
|
||||||
}
|
}
|
||||||
upsertDaemonVM(t, ctx, d.store, vm)
|
upsertDaemonVM(t, ctx, d.store, vm)
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
|
||||||
|
|
||||||
origInspect := workspaceInspectRepoFunc
|
origInspect := workspaceInspectRepoFunc
|
||||||
origImport := workspaceImportFunc
|
origImport := workspaceImportFunc
|
||||||
|
|
@ -569,7 +570,6 @@ func TestExportVMWorkspace_DoesNotMutateRealIndex(t *testing.T) {
|
||||||
vm := testVM("exportbox-readonly", "image-export", "172.16.0.107")
|
vm := testVM("exportbox-readonly", "image-export", "172.16.0.107")
|
||||||
vm.State = model.VMStateRunning
|
vm.State = model.VMStateRunning
|
||||||
vm.Runtime.State = model.VMStateRunning
|
vm.Runtime.State = model.VMStateRunning
|
||||||
vm.Runtime.PID = firecracker.Process.Pid
|
|
||||||
vm.Runtime.APISockPath = apiSock
|
vm.Runtime.APISockPath = apiSock
|
||||||
|
|
||||||
fake := &exportGuestClient{
|
fake := &exportGuestClient{
|
||||||
|
|
@ -580,6 +580,7 @@ func TestExportVMWorkspace_DoesNotMutateRealIndex(t *testing.T) {
|
||||||
}
|
}
|
||||||
d := newExportTestDaemonStore(t, fake)
|
d := newExportTestDaemonStore(t, fake)
|
||||||
upsertDaemonVM(t, ctx, d.store, vm)
|
upsertDaemonVM(t, ctx, d.store, vm)
|
||||||
|
d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
|
||||||
|
|
||||||
if _, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{IDOrName: vm.Name}); err != nil {
|
if _, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{IDOrName: vm.Name}); err != nil {
|
||||||
t.Fatalf("ExportVMWorkspace: %v", err)
|
t.Fatalf("ExportVMWorkspace: %v", err)
|
||||||
|
|
|
||||||
|
|
@ -107,11 +107,22 @@ type VMSpec struct {
|
||||||
NATEnabled bool `json:"nat_enabled"`
|
NATEnabled bool `json:"nat_enabled"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// VMRuntime holds the durable runtime state that the daemon needs
|
||||||
|
// to reach a VM: identity, declared state, and deterministic derived
|
||||||
|
// paths. Transient kernel/process handles (PID, tap, loop devices,
|
||||||
|
// dm-snapshot names) live on VMHandles, NOT here — the daemon keeps
|
||||||
|
// them in an in-memory cache backed by a per-VM handles.json scratch
|
||||||
|
// file, so a daemon restart rebuilds them from OS state rather than
|
||||||
|
// trusting whatever was last written into a SQLite column.
|
||||||
|
//
|
||||||
|
// Everything in VMRuntime is safe to persist: the paths are
|
||||||
|
// deterministic from (VM ID, layout) and survive restart unchanged;
|
||||||
|
// GuestIP and DNSName are assigned at create time and never move;
|
||||||
|
// LastError carries the last failure message for debugging. State
|
||||||
|
// mirrors VMRecord.State.
|
||||||
type VMRuntime struct {
|
type VMRuntime struct {
|
||||||
State VMState `json:"state"`
|
State VMState `json:"state"`
|
||||||
PID int `json:"pid,omitempty"`
|
|
||||||
GuestIP string `json:"guest_ip"`
|
GuestIP string `json:"guest_ip"`
|
||||||
TapDevice string `json:"tap_device,omitempty"`
|
|
||||||
APISockPath string `json:"api_sock_path,omitempty"`
|
APISockPath string `json:"api_sock_path,omitempty"`
|
||||||
VSockPath string `json:"vsock_path,omitempty"`
|
VSockPath string `json:"vsock_path,omitempty"`
|
||||||
VSockCID uint32 `json:"vsock_cid,omitempty"`
|
VSockCID uint32 `json:"vsock_cid,omitempty"`
|
||||||
|
|
@ -121,10 +132,6 @@ type VMRuntime struct {
|
||||||
VMDir string `json:"vm_dir"`
|
VMDir string `json:"vm_dir"`
|
||||||
SystemOverlay string `json:"system_overlay_path"`
|
SystemOverlay string `json:"system_overlay_path"`
|
||||||
WorkDiskPath string `json:"work_disk_path"`
|
WorkDiskPath string `json:"work_disk_path"`
|
||||||
BaseLoop string `json:"base_loop,omitempty"`
|
|
||||||
COWLoop string `json:"cow_loop,omitempty"`
|
|
||||||
DMName string `json:"dm_name,omitempty"`
|
|
||||||
DMDev string `json:"dm_dev,omitempty"`
|
|
||||||
LastError string `json:"last_error,omitempty"`
|
LastError string `json:"last_error,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
51
internal/model/vm_handles.go
Normal file
51
internal/model/vm_handles.go
Normal file
|
|
@ -0,0 +1,51 @@
|
||||||
|
package model
|
||||||
|
|
||||||
|
// VMHandles captures the transient, per-boot kernel/process handles
|
||||||
|
// that banger obtains while starting a VM and releases when stopping
|
||||||
|
// it. Unlike VMRuntime (durable spec + identity + derived paths),
|
||||||
|
// nothing in VMHandles survives a daemon restart in authoritative
|
||||||
|
// form: each value is either rediscovered from the OS (PID from the
|
||||||
|
// firecracker api socket, DM name deterministically from the VM ID)
|
||||||
|
// or read from a per-VM scratch file that the daemon rebuilds at
|
||||||
|
// every start.
|
||||||
|
//
|
||||||
|
// The daemon keeps an in-memory cache keyed by VM ID. Lifecycle
|
||||||
|
// transitions update the cache and a small `handles.json` scratch
|
||||||
|
// file in the VM's state directory; daemon startup reconciles
|
||||||
|
// by loading that file and verifying each handle against the live
|
||||||
|
// OS state. If anything is stale the VM is marked stopped and the
|
||||||
|
// cache entry is dropped.
|
||||||
|
//
|
||||||
|
// VMHandles never appears in the `vms` SQLite rows. Keeping it off
|
||||||
|
// the durable schema was the whole point of the split — persistent
|
||||||
|
// records describe what a VM SHOULD be; handles describe what is
|
||||||
|
// currently true about it.
|
||||||
|
type VMHandles struct {
|
||||||
|
// PID is the firecracker process PID. Zero means "not running
|
||||||
|
// (from our perspective)". Always verifiable via
|
||||||
|
// /proc/<pid>/cmdline matching the api socket path.
|
||||||
|
PID int `json:"pid,omitempty"`
|
||||||
|
|
||||||
|
// TapDevice is the kernel tap interface name (e.g. "tap-fc-0001")
|
||||||
|
// bound to the VM's virtio-net. Released on stop.
|
||||||
|
TapDevice string `json:"tap_device,omitempty"`
|
||||||
|
|
||||||
|
// BaseLoop and COWLoop are the two loop devices backing the
|
||||||
|
// dm-snapshot layer (read-only base = rootfs; read-write overlay
|
||||||
|
// = per-VM COW file). Released via losetup -d on stop.
|
||||||
|
BaseLoop string `json:"base_loop,omitempty"`
|
||||||
|
COWLoop string `json:"cow_loop,omitempty"`
|
||||||
|
|
||||||
|
// DMName is the device-mapper target name; deterministic from the
|
||||||
|
// VM ID (see dmsnap.SnapshotName). DMDev is the corresponding
|
||||||
|
// /dev/mapper/<name> path. Torn down by `dmsetup remove` on stop.
|
||||||
|
DMName string `json:"dm_name,omitempty"`
|
||||||
|
DMDev string `json:"dm_dev,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsZero reports whether every handle field is unset. Useful as a
|
||||||
|
// cheap "this VM has no kernel/process resources held on our behalf"
|
||||||
|
// check.
|
||||||
|
func (h VMHandles) IsZero() bool {
|
||||||
|
return h.PID == 0 && h.TapDevice == "" && h.BaseLoop == "" && h.COWLoop == "" && h.DMName == "" && h.DMDev == ""
|
||||||
|
}
|
||||||
|
|
@ -372,7 +372,6 @@ func sampleVM(name, imageID, guestIP string) model.VMRecord {
|
||||||
Runtime: model.VMRuntime{
|
Runtime: model.VMRuntime{
|
||||||
State: model.VMStateStopped,
|
State: model.VMStateStopped,
|
||||||
GuestIP: guestIP,
|
GuestIP: guestIP,
|
||||||
TapDevice: "tap-" + name,
|
|
||||||
APISockPath: "/tmp/" + name + ".sock",
|
APISockPath: "/tmp/" + name + ".sock",
|
||||||
LogPath: "/tmp/" + name + ".log",
|
LogPath: "/tmp/" + name + ".log",
|
||||||
MetricsPath: "/tmp/" + name + ".metrics",
|
MetricsPath: "/tmp/" + name + ".metrics",
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue