banger/internal/daemon/vm_handles.go
Thales Maciel d743a8ba4b
daemon: persist teardown fallbacks and reject unsafe import paths
Preserve cleanup after daemon restarts and harden OCI and tar imports
against filenames that debugfs cannot encode safely.

Mirror tap, loop, and dm teardown identity onto VM.Runtime, teach
cleanup and reconcile to fall back to those persisted fields when
handles.json is missing or corrupt, and clear the recovery state on
stop, error, and delete paths.

Reject debugfs-hostile entry names during flattening and in
ApplyOwnership itself, then add regression coverage for corrupt
handles.json recovery and unsafe import paths.

Verified with targeted go tests, make lint-go, make lint-shell, and
make build.
2026-04-23 16:21:59 -03:00

package daemon

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"sync"

	"banger/internal/model"
	"banger/internal/system"
)
// handleCache is the daemon's in-memory map of per-VM transient
// handles. It is the sole runtime source of truth for PID / tap /
// loop / DM state — persistent storage (the per-VM handles.json
// scratch file) exists only so the daemon can rebuild the cache
// after a restart.
type handleCache struct {
	mu sync.RWMutex
	m  map[string]model.VMHandles
}

func newHandleCache() *handleCache {
	return &handleCache{m: make(map[string]model.VMHandles)}
}
// get returns the cached handles for vmID and whether an entry
// exists. A missing entry means "no live handles tracked," which is
// the correct state for stopped VMs.
func (c *handleCache) get(vmID string) (model.VMHandles, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	h, ok := c.m[vmID]
	return h, ok
}

func (c *handleCache) set(vmID string, h model.VMHandles) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.m[vmID] = h
}

func (c *handleCache) clear(vmID string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	delete(c.m, vmID)
}
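
// Illustrative aside, not part of the original file: reads take only
// the RLock, so concurrent liveness polls never serialize against one
// another, while set and clear serialize behind the write lock. A
// hypothetical poller built on this split might look like:
//
//	for range ticker.C { // ticker, cache, vmID, and apiSock are hypothetical
//		if h, ok := cache.get(vmID); ok && h.PID > 0 {
//			_ = system.ProcessRunning(h.PID, apiSock)
//		}
//	}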
// handlesFilePath returns the scratch file path inside the VM
// directory where the daemon writes the last-known handles.
func handlesFilePath(vmDir string) string {
	return filepath.Join(vmDir, "handles.json")
}
// writeHandlesFile persists h to <vmDir>/handles.json. Called
// whenever the daemon successfully transitions a VM to running
// (after all handles are acquired). Best-effort: callers log a write
// failure instead of propagating it; the in-memory cache is
// authoritative while the daemon is up.
func writeHandlesFile(vmDir string, h model.VMHandles) error {
	if vmDir == "" {
		return errors.New("vm dir is required")
	}
	if err := os.MkdirAll(vmDir, 0o755); err != nil {
		return err
	}
	data, err := json.MarshalIndent(h, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(handlesFilePath(vmDir), data, 0o600)
}
// readHandlesFile loads the scratch file written at the last start.
// Returns a zero-value handles + (false, nil) if the file doesn't
// exist — that's the normal case for stopped VMs.
func readHandlesFile(vmDir string) (model.VMHandles, bool, error) {
	if vmDir == "" {
		return model.VMHandles{}, false, nil
	}
	data, err := os.ReadFile(handlesFilePath(vmDir))
	if os.IsNotExist(err) {
		return model.VMHandles{}, false, nil
	}
	if err != nil {
		return model.VMHandles{}, false, err
	}
	var h model.VMHandles
	if err := json.Unmarshal(data, &h); err != nil {
		return model.VMHandles{}, false, fmt.Errorf("parse handles.json: %w", err)
	}
	return h, true, nil
}
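
// Illustrative sketch, not part of the original file: how a restarted
// daemon could repopulate the handle cache from the per-VM scratch
// files, using the three-state return above. The vmDirByID map is a
// hypothetical stand-in for however the caller enumerates VM
// directories.
func rebuildHandleCache(cache *handleCache, vmDirByID map[string]string) error {
	for vmID, dir := range vmDirByID {
		h, ok, err := readHandlesFile(dir)
		if err != nil {
			// handles.json exists but is corrupt; per the commit
			// message, reconcile then falls back to the teardown
			// fields mirrored on VM.Runtime.
			return fmt.Errorf("recover handles for %s: %w", vmID, err)
		}
		if ok {
			cache.set(vmID, h)
		}
	}
	return nil
}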
// removeHandlesFile best-effort deletes the per-VM scratch file;
// removal errors are deliberately ignored.
func removeHandlesFile(vmDir string) {
	if vmDir == "" {
		return
	}
	_ = os.Remove(handlesFilePath(vmDir))
}
// ensureHandleCache lazily constructs the cache so direct
// `&Daemon{}` literals (common in tests) don't have to initialise
// it. Production code goes through Open(), which also builds it.
func (s *VMService) ensureHandleCache() {
	if s.handles == nil {
		s.handles = newHandleCache()
	}
}
// setVMHandlesInMemory is a test-only cache seed that skips the
// scratch-file write. Production callers should use setVMHandles so
// the handles survive a daemon restart.
func (s *VMService) setVMHandlesInMemory(vmID string, h model.VMHandles) {
	if s == nil {
		return
	}
	s.ensureHandleCache()
	s.handles.set(vmID, h)
}
// vmHandles returns the cached handles for vmID (zero-value if no
// entry). The in-process handle cache is the authoritative source
// for PID and live kernel/network handles; VMRecord.Runtime only
// mirrors teardown-critical fields for restart recovery.
func (s *VMService) vmHandles(vmID string) model.VMHandles {
	if s == nil {
		return model.VMHandles{}
	}
	s.ensureHandleCache()
	h, _ := s.handles.get(vmID)
	return h
}
// setVMHandles updates the in-memory cache, mirrors teardown-critical
// fields onto VMRuntime, and writes the per-VM scratch file.
// Scratch-file errors are logged but not returned; the cache remains
// authoritative while the daemon is alive.
func (s *VMService) setVMHandles(vm *model.VMRecord, h model.VMHandles) {
	if s == nil || vm == nil {
		return
	}
	persistRuntimeTeardownState(vm, h)
	s.ensureHandleCache()
	s.handles.set(vm.ID, h)
	if err := writeHandlesFile(vm.Runtime.VMDir, h); err != nil && s.logger != nil {
		s.logger.Warn("persist handles.json failed", "vm_id", vm.ID, "error", err.Error())
	}
}
// clearVMHandles drops the cache entry and removes the scratch
// file. Called on stop / delete / after a failed start.
func (s *VMService) clearVMHandles(vm model.VMRecord) {
	if s == nil {
		return
	}
	s.ensureHandleCache()
	s.handles.clear(vm.ID)
	removeHandlesFile(vm.Runtime.VMDir)
}
// vmAlive is the canonical "is this VM actually running?" check.
// Unlike the old `system.ProcessRunning(vm.Runtime.PID, apiSock)`
// pattern, this reads the PID from the handle cache — which is
// authoritative in-process — and verifies the PID against the api
// socket so a recycled PID can't false-positive.
func (s *VMService) vmAlive(vm model.VMRecord) bool {
	if vm.State != model.VMStateRunning {
		return false
	}
	h := s.vmHandles(vm.ID)
	if h.PID <= 0 {
		return false
	}
	return system.ProcessRunning(h.PID, vm.Runtime.APISockPath)
}
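
// Illustrative aside, not part of the original file: the api-socket
// check matters because the kernel recycles PIDs. After a crash, the
// saved PID may now belong to an unrelated process, so a bare
// signal-0 liveness probe could report a dead firecracker as running.
// A hypothetical stop path can therefore gate cleanup on vmAlive:
//
//	if !s.vmAlive(vm) {
//		// teardownLeftovers is hypothetical; it would release tap,
//		// loop, and dm state using the cached handles.
//		return s.teardownLeftovers(ctx, s.vmHandles(vm.ID))
//	}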
// rediscoverHandles loads what the last daemon start knew about a VM
// from its handles.json scratch file and verifies the firecracker
// process is still alive. Returns:
//
//   - handles: the scratch-file contents (zero-value if no file).
//     ALWAYS returned, even when alive=false, because the caller
//     needs them to tear down kernel state (dm-snapshot, loops, tap)
//     that the previous daemon left behind when it died.
//   - alive: true iff a firecracker process matching the api sock is
//     currently running.
//   - err: unexpected failure (file exists but is corrupt).
//
// Strategy: pgrep by api sock path first (handles the case where
// the daemon crashed but the PID changed on respawn — unlikely for
// firecracker, but cheap insurance); fall back to verifying the
// scratch file's PID directly.
func (s *VMService) rediscoverHandles(ctx context.Context, vm model.VMRecord) (model.VMHandles, bool, error) {
	saved, _, err := readHandlesFile(vm.Runtime.VMDir)
	if err != nil {
		return model.VMHandles{}, false, err
	}
	apiSock := vm.Runtime.APISockPath
	if apiSock == "" {
		return saved, false, nil
	}
	if pid, pidErr := s.net.findFirecrackerPID(ctx, apiSock); pidErr == nil && pid > 0 {
		saved.PID = pid
		return saved, true, nil
	}
	if saved.PID > 0 && system.ProcessRunning(saved.PID, apiSock) {
		return saved, true, nil
	}
	return saved, false, nil
}
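
// Illustrative sketch, not part of the original file, of the reconcile
// fallback described in the commit message. handlesFromRuntime and
// teardownLeftovers are hypothetical helpers named only for
// illustration:
//
//	h, alive, err := s.rediscoverHandles(ctx, vm)
//	if err != nil {
//		// handles.json is corrupt: recover teardown identity from
//		// the fields mirrored onto VM.Runtime instead of skipping
//		// cleanup.
//		h = handlesFromRuntime(vm.Runtime) // hypothetical
//	}
//	if alive {
//		s.setVMHandles(&vm, h) // re-adopt the running VM
//	} else {
//		_ = teardownLeftovers(ctx, h) // hypothetical: tap, loop, dm cleanup
//	}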