Cleanup identity for kernel objects was split across two sources of
truth: vm.Runtime (DB-backed, durable) held paths and the guest IP,
but the TAP name lived only in the in-process handle cache plus the
best-effort handles.json scratch file next to the VM dir. Every
other cleanup-identifying datum has a fallback: the firecracker PID
can be rediscovered via `pgrep -f <apiSock>`, loop devices via
`losetup`, and the dm name from the deterministic ShortID(vm.ID).
The tap is the one truly cache-only datum (allocated from a pool,
not derivable).
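For contrast, a minimal sketch of the kind of rediscovery the PID
enjoys and the tap lacks; the helper name is hypothetical, only the
`pgrep -f` lookup against the API socket path comes from the text:

```go
package daemon

import (
	"context"
	"fmt"
	"os/exec"
	"strconv"
	"strings"
)

// rediscoverFirecrackerPID (hypothetical helper) finds the firecracker
// process whose command line mentions this VM's API socket path.
func rediscoverFirecrackerPID(ctx context.Context, apiSock string) (int, error) {
	out, err := exec.CommandContext(ctx, "pgrep", "-f", apiSock).Output()
	if err != nil {
		return 0, fmt.Errorf("no firecracker process matching %s: %w", apiSock, err)
	}
	pids := strings.Fields(string(out)) // pgrep prints one PID per line
	if len(pids) == 0 {
		return 0, fmt.Errorf("no firecracker process matching %s", apiSock)
	}
	return strconv.Atoi(pids[0])
}
```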
That made NAT teardown fragile in several cases:
- a daemon crash between `acquireTap` and the handles.json write
- a handles.json found corrupt on the next daemon start
- a partial cleanup that had already zeroed the cache
In any of those cases natCapability.Cleanup short-circuited
("skipping nat cleanup without runtime network handles") and the
per-VM POSTROUTING MASQUERADE rule plus the two FORWARD rules keyed
off the tap would leak. The VM row in the DB still existed, so a
retry couldn't close the loop: the tap name was simply gone.
Fix: mirror TapDevice onto model.VMRuntime (serialised via the
existing runtime_json column, with omitempty so existing rows
upgrade cleanly). Set it in startVMLocked right next to the
s.setVMHandles call that seeds the in-memory cache; clear it at
every post-cleanup reset site (the normal and stale branches of
both stop and kill, cleanupOnErr in start, reconcile's stale-vm
branch, and the stats poller's auto-stop path).
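A minimal sketch of the shape of that field, assuming a JSON tag
name; the real VMRuntime struct lives in model and carries many more
fields than shown here:

```go
package model

// VMRuntime is the DB-backed runtime blob persisted in the runtime_json
// column. Only the new field is shown; the real struct also carries the
// paths, guest IP, vsock details, etc.
type VMRuntime struct {
	// TapDevice mirrors the pool-allocated tap name so cleanup can still
	// find it when the in-process handle cache and handles.json are gone.
	// omitempty means rows written before this change round-trip unchanged.
	TapDevice string `json:"tap_device,omitempty"`
}
```

Setting it beside s.setVMHandles in startVMLocked and blanking it at
each reset site (as the auto-stop path in the attached file does with
vm.Runtime.TapDevice = "") keeps the durable copy in lockstep with
the cache.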
Fallbacks now cascade:
- natCapability.Cleanup: handles cache → Runtime.TapDevice
- cleanupRuntime (releaseTap): handles cache → Runtime.TapDevice
Both surfaces refuse gracefully (old behaviour) only when neither
source has a value, which really does mean "no tap was ever
allocated for this VM" rather than "we lost track of it."
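A sketch of the cascade both call sites can share; resolveTap is a
hypothetical helper name, and only the cache-then-Runtime.TapDevice
ordering is taken from the description above:

```go
package daemon

import "banger/internal/model"

// resolveTap (hypothetical helper) picks the tap name used for teardown:
// the in-process handle cache wins, the durable Runtime.TapDevice mirror is
// the fallback, and an empty result genuinely means no tap was ever
// allocated for this VM.
func resolveTap(cachedTap string, vm model.VMRecord) string {
	if cachedTap != "" {
		return cachedTap
	}
	return vm.Runtime.TapDevice
}
```

natCapability.Cleanup and cleanupRuntime's releaseTap step then keep
the old graceful refusal only when a lookup like this comes back
empty.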
Test: TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice clears
the handle cache, sets vm.Runtime.TapDevice, and asserts Cleanup
reaches the runner: the exact scenario the review flagged as a
plausible leak, and the exact code path that now guarantees the
leak cannot happen.
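A rough shape of that test; the fixture helpers (newTestVMService,
the recording fake runner, the svc.nat field) are assumptions, and
only the cache-clear / TapDevice / Cleanup sequence comes from the
description above:

```go
package daemon

import (
	"context"
	"testing"

	"banger/internal/model"
)

func TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice(t *testing.T) {
	// newTestVMService and the recording fake runner are hypothetical fixtures.
	svc, runner := newTestVMService(t)

	vm := model.VMRecord{ID: "vm-1"}
	vm.Runtime.TapDevice = "tap7" // the durable mirror survives...
	svc.clearVMHandles(vm)        // ...even though the handle cache is empty

	if err := svc.nat.Cleanup(context.Background(), vm); err != nil {
		t.Fatalf("Cleanup: %v", err)
	}
	if !runner.sawTeardownFor("tap7") {
		t.Fatal("NAT teardown never reached the runner via Runtime.TapDevice")
	}
}
```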
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
package daemon

import (
	"context"
	"errors"
	"strings"
	"time"

	"banger/internal/api"
	"banger/internal/model"
	"banger/internal/system"
	"banger/internal/vsockagent"
)

// GetVMStats refreshes and returns the stats for the VM identified by id or name.
func (s *VMService) GetVMStats(ctx context.Context, idOrName string) (model.VMRecord, model.VMStats, error) {
	vm, err := s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		return s.getVMStatsLocked(ctx, vm)
	})
	if err != nil {
		return model.VMRecord{}, model.VMStats{}, err
	}
	return vm, vm.Stats, nil
}

// HealthVM reports whether a running VM's guest agent answers a vsock health ping.
func (s *VMService) HealthVM(ctx context.Context, idOrName string) (result api.VMHealthResult, err error) {
	_, err = s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		result.Name = vm.Name
		if !s.vmAlive(vm) {
			result.Healthy = false
			return vm, nil
		}
		if strings.TrimSpace(vm.Runtime.VSockPath) == "" {
			return model.VMRecord{}, errors.New("vm has no vsock path")
		}
		if vm.Runtime.VSockCID == 0 {
			return model.VMRecord{}, errors.New("vm has no vsock cid")
		}
		if err := s.net.ensureSocketAccess(ctx, vm.Runtime.VSockPath, "firecracker vsock socket"); err != nil {
			return model.VMRecord{}, err
		}
		pingCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
		defer cancel()
		if err := vsockagent.Health(pingCtx, s.logger, vm.Runtime.VSockPath); err != nil {
			return model.VMRecord{}, err
		}
		result.Healthy = true
		return vm, nil
	})
	return result, err
}

// PingVM is a thin wrapper around HealthVM that reports only name and liveness.
func (s *VMService) PingVM(ctx context.Context, idOrName string) (result api.VMPingResult, err error) {
	health, err := s.HealthVM(ctx, idOrName)
	if err != nil {
		return api.VMPingResult{}, err
	}
	return api.VMPingResult{Name: health.Name, Alive: health.Healthy}, nil
}

// getVMStatsLocked collects stats under the VM lock and persists them
// best-effort; collection failures leave the record untouched.
func (s *VMService) getVMStatsLocked(ctx context.Context, vm model.VMRecord) (model.VMRecord, error) {
	stats, err := s.collectStats(ctx, vm)
	if err == nil {
		vm.Stats = stats
		vm.UpdatedAt = model.Now()
		_ = s.store.UpsertVM(ctx, vm)
		if s.logger != nil {
			s.logger.Debug("vm stats collected", append(vmLogAttrs(vm), "rss_bytes", stats.RSSBytes, "vsz_bytes", stats.VSZBytes, "cpu_percent", stats.CPUPercent)...)
		}
	}
	return vm, nil
}

// pollStats walks all VMs and refreshes stats for the ones that are alive.
func (s *VMService) pollStats(ctx context.Context) error {
	vms, err := s.store.ListVMs(ctx)
	if err != nil {
		return err
	}
	for _, vm := range vms {
		if err := s.withVMLockByIDErr(ctx, vm.ID, func(vm model.VMRecord) error {
			if !s.vmAlive(vm) {
				return nil
			}
			stats, err := s.collectStats(ctx, vm)
			if err != nil {
				if s.logger != nil {
					s.logger.Debug("vm stats collection failed", append(vmLogAttrs(vm), "error", err.Error())...)
				}
				return nil
			}
			vm.Stats = stats
			vm.UpdatedAt = model.Now()
			return s.store.UpsertVM(ctx, vm)
		}); err != nil {
			return err
		}
	}
	return nil
}

// stopStaleVMs gracefully stops VMs that have been idle longer than
// AutoStopStaleAfter and tears down their runtime state.
func (s *VMService) stopStaleVMs(ctx context.Context) (err error) {
	if s.config.AutoStopStaleAfter <= 0 {
		return nil
	}
	op := s.beginOperation("vm.stop_stale")
	defer func() {
		if err != nil {
			op.fail(err)
			return
		}
		op.done()
	}()
	vms, err := s.store.ListVMs(ctx)
	if err != nil {
		return err
	}
	now := model.Now()
	for _, vm := range vms {
		if err := s.withVMLockByIDErr(ctx, vm.ID, func(vm model.VMRecord) error {
			if !s.vmAlive(vm) {
				return nil
			}
			if now.Sub(vm.LastTouchedAt) < s.config.AutoStopStaleAfter {
				return nil
			}
			op.stage("stopping_vm", vmLogAttrs(vm)...)
			_ = s.net.sendCtrlAltDel(ctx, vm.Runtime.APISockPath)
			_ = s.net.waitForExit(ctx, s.vmHandles(vm.ID).PID, vm.Runtime.APISockPath, 10*time.Second)
			_ = s.cleanupRuntime(ctx, vm, true)
			vm.State = model.VMStateStopped
			vm.Runtime.State = model.VMStateStopped
			// Clear the durable tap mirror now that cleanupRuntime has released it.
			vm.Runtime.TapDevice = ""
			s.clearVMHandles(vm)
			vm.UpdatedAt = model.Now()
			return s.store.UpsertVM(ctx, vm)
		}); err != nil {
			return err
		}
	}
	return nil
}

// collectStats gathers disk, metrics-file and (for live VMs) process stats.
func (s *VMService) collectStats(ctx context.Context, vm model.VMRecord) (model.VMStats, error) {
	stats := model.VMStats{
		CollectedAt:        model.Now(),
		SystemOverlayBytes: system.AllocatedBytes(vm.Runtime.SystemOverlay),
		WorkDiskBytes:      system.AllocatedBytes(vm.Runtime.WorkDiskPath),
		MetricsRaw:         system.ParseMetricsFile(vm.Runtime.MetricsPath),
	}
	if s.vmAlive(vm) {
		if ps, err := system.ReadProcessStats(ctx, s.vmHandles(vm.ID).PID); err == nil {
			stats.CPUPercent = ps.CPUPercent
			stats.RSSBytes = ps.RSSBytes
			stats.VSZBytes = ps.VSZBytes
		}
	}
	return stats, nil
}