package daemon

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"sync"

	"banger/internal/model"
)

// handleCache is the daemon's in-memory map of per-VM transient
// handles. It is the sole runtime source of truth for PID / tap /
// loop / DM state — persistent storage (the per-VM handles.json
// scratch file) exists only so the daemon can rebuild the cache
// after a restart.
type handleCache struct {
	mu sync.RWMutex
	m  map[string]model.VMHandles
}

func newHandleCache() *handleCache {
	return &handleCache{m: make(map[string]model.VMHandles)}
}

// get returns the cached handles for vmID and whether an entry
// exists. A missing entry means "no live handles tracked," which is
// the correct state for stopped VMs.
func (c *handleCache) get(vmID string) (model.VMHandles, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	h, ok := c.m[vmID]
	return h, ok
}

func (c *handleCache) set(vmID string, h model.VMHandles) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.m[vmID] = h
}

func (c *handleCache) clear(vmID string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	delete(c.m, vmID)
}

// handlesFilePath returns the scratch file path inside the VM
// directory where the daemon writes the last-known handles.
func handlesFilePath(vmDir string) string {
	return filepath.Join(vmDir, "handles.json")
}

// writeHandlesFile persists h to <vmDir>/handles.json. Called
// whenever the daemon successfully transitions a VM to running
// (after all handles are acquired). Best-effort at the call site:
// setVMHandles logs a write failure rather than propagating it — the
// in-memory cache is authoritative while the daemon is up.
func writeHandlesFile(vmDir string, h model.VMHandles) error {
	if vmDir == "" {
		return errors.New("vm dir is required")
	}
	if err := os.MkdirAll(vmDir, 0o755); err != nil {
		return err
	}
	data, err := json.MarshalIndent(h, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(handlesFilePath(vmDir), data, 0o600)
}

// readHandlesFile loads the scratch file written at the last start.
// Returns zero-value handles and (false, nil) if the file doesn't
// exist — that's the normal case for stopped VMs.
func readHandlesFile(vmDir string) (model.VMHandles, bool, error) {
	if vmDir == "" {
		return model.VMHandles{}, false, nil
	}
	data, err := os.ReadFile(handlesFilePath(vmDir))
	if os.IsNotExist(err) {
		return model.VMHandles{}, false, nil
	}
	if err != nil {
		return model.VMHandles{}, false, err
	}
	var h model.VMHandles
	if err := json.Unmarshal(data, &h); err != nil {
		return model.VMHandles{}, false, fmt.Errorf("parse handles.json: %w", err)
	}
	return h, true, nil
}

func removeHandlesFile(vmDir string) {
	if vmDir == "" {
		return
	}
	_ = os.Remove(handlesFilePath(vmDir))
}

// ensureHandleCache lazily constructs the cache so direct
// `&Daemon{}` literals (common in tests) don't have to initialise
// it. Production code goes through Open(), which also builds it.
func (s *VMService) ensureHandleCache() {
	if s.handles == nil {
		s.handles = newHandleCache()
	}
}

// setVMHandlesInMemory is a test-only cache seed that skips the
// scratch-file write. Production callers should use setVMHandles so
// the handles also survive a daemon restart.
func (s *VMService) setVMHandlesInMemory(vmID string, h model.VMHandles) {
	if s == nil {
		return
	}
	s.ensureHandleCache()
	s.handles.set(vmID, h)
}

// vmHandles returns the cached handles for vm (zero-value if no
// entry). The in-process handle cache is the authoritative source
// for PID and live kernel/network handles; VMRecord.Runtime only
// mirrors teardown-critical fields for restart recovery.
func (s *VMService) vmHandles(vmID string) model.VMHandles {
	if s == nil {
		return model.VMHandles{}
	}
	s.ensureHandleCache()
	h, _ := s.handles.get(vmID)
	return h
}

// setVMHandles updates the in-memory cache, mirrors teardown-critical
// fields onto VMRuntime, and writes the per-VM scratch file.
// Scratch-file errors are logged but not returned; the cache remains
// authoritative while the daemon is alive.
//
// Write order: file first, cache second. A daemon crash between the
// two leaves the on-disk scratch file ahead of the in-memory cache —
// which is the recoverable direction, since reconcile re-seeds the
// cache from the file on the next start. The reverse order would let
// a crash strand handles the daemon already saw as live but never
// persisted, breaking the next-start teardown of DM/loops/tap.
func (s *VMService) setVMHandles(vm *model.VMRecord, h model.VMHandles) {
	if s == nil || vm == nil {
		return
	}
	persistRuntimeTeardownState(vm, h)
	s.ensureHandleCache()
	if err := writeHandlesFile(vm.Runtime.VMDir, h); err != nil && s.logger != nil {
		s.logger.Warn("persist handles.json failed", "vm_id", vm.ID, "error", err.Error())
	}
	s.handles.set(vm.ID, h)
}

// clearVMHandles drops the cache entry and removes the scratch
// file. Called on stop / delete / after a failed start.
func (s *VMService) clearVMHandles(vm model.VMRecord) {
	if s == nil {
		return
	}
	s.ensureHandleCache()
	s.handles.clear(vm.ID)
	removeHandlesFile(vm.Runtime.VMDir)
}

// vmAlive is the canonical "is this VM actually running?" check.
// Unlike the old `system.ProcessRunning(vm.Runtime.PID, apiSock)`
// pattern, this reads the PID from the handle cache — which is
// authoritative in-process — and verifies the PID against the api
// socket so a recycled PID can't false-positive.
func (s *VMService) vmAlive(vm model.VMRecord) bool {
	if vm.State != model.VMStateRunning {
		return false
	}
	h := s.vmHandles(vm.ID)
	if h.PID <= 0 {
		return false
	}
	running, err := s.privOps().ProcessRunning(context.Background(), h.PID, vm.Runtime.APISockPath)
	return err == nil && running
}

// rediscoverHandles loads what the last daemon start knew about a VM
// from its handles.json scratch file and verifies the firecracker
// process is still alive. Returns:
//
//   - handles: the scratch-file contents (zero-value if no file).
//     ALWAYS returned, even when alive=false, because the caller
//     needs them to tear down kernel state (dm-snapshot, loops, tap)
//     that the previous daemon left behind when it died.
//   - alive: true iff a firecracker process matching the api sock is
//     currently running.
//   - err: unexpected failure (file exists but is corrupt).
//
// Strategy: pgrep by api sock path first (handles the case where
// the daemon crashed but the PID changed on respawn — unlikely for
// firecracker, but cheap insurance); fall back to verifying the
// scratch file's PID directly.
func (s *VMService) rediscoverHandles(ctx context.Context, vm model.VMRecord) (model.VMHandles, bool, error) {
	saved, _, err := readHandlesFile(vm.Runtime.VMDir)
	if err != nil {
		return model.VMHandles{}, false, err
	}
	apiSock := vm.Runtime.APISockPath
	if apiSock == "" {
		return saved, false, nil
	}
	if pid, pidErr := s.net.findFirecrackerPID(ctx, apiSock); pidErr == nil && pid > 0 {
		saved.PID = pid
		return saved, true, nil
	}
	if saved.PID > 0 {
		if running, runErr := s.privOps().ProcessRunning(ctx, saved.PID, apiSock); runErr == nil && running {
			return saved, true, nil
		}
	}
	return saved, false, nil
}
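
// reconcileVMHandlesSketch is an illustrative sketch (hypothetical name, not
// called anywhere in this package) of how a restart-time reconcile pass might
// consume rediscoverHandles together with the cache helpers above. It only
// uses helpers defined in this file; the daemon's real reconcile flow,
// including the actual teardown of stale tap/loop/DM handles, lives elsewhere
// and may order things differently.
func (s *VMService) reconcileVMHandlesSketch(ctx context.Context, vm model.VMRecord) (stale model.VMHandles, needsTeardown bool, err error) {
	h, alive, err := s.rediscoverHandles(ctx, vm)
	if err != nil {
		// Corrupt handles.json: surface the error rather than guessing.
		return model.VMHandles{}, false, err
	}
	if alive {
		// The firecracker process survived the daemon restart: re-seed the
		// in-memory cache from the scratch file so vmHandles/vmAlive see it.
		s.setVMHandlesInMemory(vm.ID, h)
		return model.VMHandles{}, false, nil
	}
	// Process is gone: hand the last-known handles back so the caller can tear
	// down dm-snapshot / loop / tap state, and call clearVMHandles only once
	// that teardown succeeds, so the scratch file outlives a failed attempt.
	return h, true, nil
}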