Four targeted fixes from a race-condition audit of the daemon package.
None change behaviour on the happy path; each closes a window where a
concurrent or interrupted RPC could strand state on the host.
- KernelDelete now holds the same per-name lock as KernelPull /
readOrAutoPullKernel. Without it, a delete racing a concurrent
pull could remove files mid-write or land between the pull's
manifest write and its first use.
- cleanupRuntime no longer early-returns on an inner waitForExit
  failure; DM snapshot, capability, and tap teardown always run and
  every error is folded into the returned errors.Join. EBUSY against
  a still-alive firecracker is benign and surfaces in the joined
  error rather than stranding kernel state across daemon restarts
  (teardown shape sketched after this list).
- Per-name image / kernel pull locks switch from *sync.Mutex to a
  1-buffered chan struct{}. Acquire is a select on ctx.Done(), so a
  peer waiting behind a pull whose RPC was cancelled can bail out
  instead of blocking forever on a pull nobody is consuming (see the
  lock sketch after this list).
- setVMHandles writes the per-VM scratch file before updating the
in-memory cache. A daemon crash between the two now leaves disk
ahead of memory (recoverable: reconcile re-seeds the cache from
the file on next start) rather than memory ahead of disk (lost
handles → stranded DM/loops/tap).
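
For reference, a minimal sketch of the context-aware per-name lock
described in the third bullet. The names here (pullLocks, lockFor,
acquire, release) are illustrative, not the daemon's actual
identifiers; only context and sync from the standard library are
assumed:

    // pullLocks hands out one lock per image/kernel name. Each lock is a
    // 1-buffered channel: holding the lock means your empty struct sits
    // in the buffer, so acquisition can race against ctx.Done().
    type pullLocks struct {
        mu    sync.Mutex
        locks map[string]chan struct{}
    }

    func (p *pullLocks) lockFor(name string) chan struct{} {
        p.mu.Lock()
        defer p.mu.Unlock()
        if p.locks == nil {
            p.locks = make(map[string]chan struct{})
        }
        ch, ok := p.locks[name]
        if !ok {
            ch = make(chan struct{}, 1) // capacity 1: at most one holder
            p.locks[name] = ch
        }
        return ch
    }

    // acquire blocks until the lock for name is free or ctx is cancelled,
    // so a peer queued behind an abandoned pull can bail out.
    func (p *pullLocks) acquire(ctx context.Context, name string) error {
        select {
        case p.lockFor(name) <- struct{}{}:
            return nil
        case <-ctx.Done():
            return ctx.Err()
        }
    }

    // release must only be called by the goroutine that acquired the lock.
    func (p *pullLocks) release(name string) {
        <-p.lockFor(name)
    }

KernelPull, readOrAutoPullKernel, and now KernelDelete would all go
through acquire/release for the same name, which is also what closes
the delete-vs-pull window in the first bullet.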
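And a sketch of the no-early-return teardown shape from the second
bullet; runTeardown and the step closures are placeholders, not the
real cleanupRuntime helpers:

    // runTeardown runs every step even after failures and returns a single
    // joined error, so a benign EBUSY from a still-alive firecracker is
    // reported without skipping the DM snapshot, capability, or tap steps
    // that follow it.
    func runTeardown(ctx context.Context, steps ...func(context.Context) error) error {
        var errs []error
        for _, step := range steps {
            if err := step(ctx); err != nil {
                errs = append(errs, err) // record and keep tearing down
            }
        }
        return errors.Join(errs...) // nil when every step succeeded
    }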
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
package daemon

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"sync"

	"banger/internal/model"
)

// handleCache is the daemon's in-memory map of per-VM transient
// handles. It is the sole runtime source of truth for PID / tap /
// loop / DM state — persistent storage (the per-VM handles.json
// scratch file) exists only so the daemon can rebuild the cache
// after a restart.
type handleCache struct {
	mu sync.RWMutex
	m  map[string]model.VMHandles
}

func newHandleCache() *handleCache {
	return &handleCache{m: make(map[string]model.VMHandles)}
}

// get returns the cached handles for vmID and whether an entry
// exists. A missing entry means "no live handles tracked," which is
// the correct state for stopped VMs.
func (c *handleCache) get(vmID string) (model.VMHandles, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	h, ok := c.m[vmID]
	return h, ok
}

func (c *handleCache) set(vmID string, h model.VMHandles) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.m[vmID] = h
}

func (c *handleCache) clear(vmID string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	delete(c.m, vmID)
}

// handlesFilePath returns the scratch file path inside the VM
// directory where the daemon writes the last-known handles.
func handlesFilePath(vmDir string) string {
	return filepath.Join(vmDir, "handles.json")
}

// writeHandlesFile persists h to <vmDir>/handles.json. Called
// whenever the daemon successfully transitions a VM to running
// (after all handles are acquired). Best-effort: a write failure is
// logged, not propagated — the in-memory cache is authoritative
// while the daemon is up.
func writeHandlesFile(vmDir string, h model.VMHandles) error {
	if vmDir == "" {
		return errors.New("vm dir is required")
	}
	if err := os.MkdirAll(vmDir, 0o755); err != nil {
		return err
	}
	data, err := json.MarshalIndent(h, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(handlesFilePath(vmDir), data, 0o600)
}

// readHandlesFile loads the scratch file written at the last start.
// Returns a zero-value handles + (false, nil) if the file doesn't
// exist — that's the normal case for stopped VMs.
func readHandlesFile(vmDir string) (model.VMHandles, bool, error) {
	if vmDir == "" {
		return model.VMHandles{}, false, nil
	}
	data, err := os.ReadFile(handlesFilePath(vmDir))
	if os.IsNotExist(err) {
		return model.VMHandles{}, false, nil
	}
	if err != nil {
		return model.VMHandles{}, false, err
	}
	var h model.VMHandles
	if err := json.Unmarshal(data, &h); err != nil {
		return model.VMHandles{}, false, fmt.Errorf("parse handles.json: %w", err)
	}
	return h, true, nil
}

func removeHandlesFile(vmDir string) {
	if vmDir == "" {
		return
	}
	_ = os.Remove(handlesFilePath(vmDir))
}

// ensureHandleCache lazily constructs the cache so direct
// `&Daemon{}` literals (common in tests) don't have to initialise
// it. Production code goes through Open(), which also builds it.
func (s *VMService) ensureHandleCache() {
	if s.handles == nil {
		s.handles = newHandleCache()
	}
}

// setVMHandlesInMemory is a test-only cache seed that skips the
// scratch-file write. Production callers should use setVMHandles so
// the filesystem survives a daemon restart.
func (s *VMService) setVMHandlesInMemory(vmID string, h model.VMHandles) {
	if s == nil {
		return
	}
	s.ensureHandleCache()
	s.handles.set(vmID, h)
}

// vmHandles returns the cached handles for vm (zero-value if no
// entry). The in-process handle cache is the authoritative source
// for PID and live kernel/network handles; VMRecord.Runtime only
// mirrors teardown-critical fields for restart recovery.
func (s *VMService) vmHandles(vmID string) model.VMHandles {
	if s == nil {
		return model.VMHandles{}
	}
	s.ensureHandleCache()
	h, _ := s.handles.get(vmID)
	return h
}

// setVMHandles updates the in-memory cache, mirrors teardown-critical
// fields onto VMRuntime, and writes the per-VM scratch file.
// Scratch-file errors are logged but not returned; the cache remains
// authoritative while the daemon is alive.
//
// Write order: file first, cache second. A daemon crash between the
// two leaves the on-disk scratch file ahead of the in-memory cache —
// which is the recoverable direction, since reconcile re-seeds the
// cache from the file on the next start. The reverse order would let
// a crash strand handles the daemon already saw as live but never
// persisted, breaking the next-start teardown of DM/loops/tap.
func (s *VMService) setVMHandles(vm *model.VMRecord, h model.VMHandles) {
	if s == nil || vm == nil {
		return
	}
	persistRuntimeTeardownState(vm, h)
	s.ensureHandleCache()
	if err := writeHandlesFile(vm.Runtime.VMDir, h); err != nil && s.logger != nil {
		s.logger.Warn("persist handles.json failed", "vm_id", vm.ID, "error", err.Error())
	}
	s.handles.set(vm.ID, h)
}

// clearVMHandles drops the cache entry and removes the scratch
// file. Called on stop / delete / after a failed start.
func (s *VMService) clearVMHandles(vm model.VMRecord) {
	if s == nil {
		return
	}
	s.ensureHandleCache()
	s.handles.clear(vm.ID)
	removeHandlesFile(vm.Runtime.VMDir)
}

// vmAlive is the canonical "is this VM actually running?" check.
// Unlike the old `system.ProcessRunning(vm.Runtime.PID, apiSock)`
// pattern, this reads the PID from the handle cache — which is
// authoritative in-process — and verifies the PID against the api
// socket so a recycled PID can't false-positive.
func (s *VMService) vmAlive(vm model.VMRecord) bool {
	if vm.State != model.VMStateRunning {
		return false
	}
	h := s.vmHandles(vm.ID)
	if h.PID <= 0 {
		return false
	}
	running, err := s.privOps().ProcessRunning(context.Background(), h.PID, vm.Runtime.APISockPath)
	return err == nil && running
}

// rediscoverHandles loads what the last daemon start knew about a VM
// from its handles.json scratch file and verifies the firecracker
// process is still alive. Returns:
//
//   - handles: the scratch-file contents (zero-value if no file).
//     ALWAYS returned, even when alive=false, because the caller
//     needs them to tear down kernel state (dm-snapshot, loops, tap)
//     that the previous daemon left behind when it died.
//   - alive: true iff a firecracker process matching the api sock is
//     currently running.
//   - err: unexpected failure (file exists but is corrupt).
//
// Strategy: pgrep by api sock path first (handles the case where
// the daemon crashed but the PID changed on respawn — unlikely for
// firecracker, but cheap insurance); fall back to verifying the
// scratch file's PID directly.
func (s *VMService) rediscoverHandles(ctx context.Context, vm model.VMRecord) (model.VMHandles, bool, error) {
	saved, _, err := readHandlesFile(vm.Runtime.VMDir)
	if err != nil {
		return model.VMHandles{}, false, err
	}
	apiSock := vm.Runtime.APISockPath
	if apiSock == "" {
		return saved, false, nil
	}
	if pid, pidErr := s.net.findFirecrackerPID(ctx, apiSock); pidErr == nil && pid > 0 {
		saved.PID = pid
		return saved, true, nil
	}
	if saved.PID > 0 {
		if running, runErr := s.privOps().ProcessRunning(ctx, saved.PID, apiSock); runErr == nil && running {
			return saved, true, nil
		}
	}
	return saved, false, nil
}