banger/internal/daemon/vm_handles.go
Thales Maciel 700a1e6e60
cleanup: drop pre-v0.1 migration scaffolding + legacy-behavior refs
banger hasn't shipped a public release — every "legacy", "pre-opt-in",
"previously", "migration note", "no longer" reference in the tree is
pinning against a state no real user's install has ever been in.
That scaffolding has weight: it's a coordinate system future readers
have to decode, and it keeps dead code alive.

Removed (code):
- internal/daemon/ssh_client_config.go
    - vmSSHConfigIncludeBegin / vmSSHConfigIncludeEnd constants and
      every `removeManagedBlock(existing, vm...)` call they enabled
      (legacy inline `Host *.vm` block scrub)
    - cleanupLegacySSHConfigDir (+ its caller in syncVMSSHClientConfig)
      — wiped a pre-opt-in sibling file under $ConfigDir/ssh
    - sameDirOrParent + resolvePathForComparison — only ever used
      by cleanupLegacySSHConfigDir
    - the "also check legacy marker" fallback in
      UserSSHIncludeInstalled / UninstallUserSSHInclude
- internal/store/migrations.go
    - migrateDropDeadImageColumns (migration 2) + its slice entry
    - dropColumnIfExists (orphaned after the above)
    - addColumnIfMissing + the whole "columns added across the pre-
      versioning lifetime" block at the end of migrateBaseline —
      subsumed into the baseline CREATE TABLE
    - `packages_path TEXT` column on the images table (the
      throwaway migration 2 dropped it, but there was never any
      reader)
- internal/daemon/vm.go
    - vmDNSRecordName local wrapper — was justified as "avoid
      pulling vmdns into every file"; three of four callers already
      imported vmdns directly, so inline the one stray call
- internal/cli/cli_test.go
    - TestLegacyRemovedCommandIsRejected (`tui` subcommand never
      shipped)

Removed / simplified (tests):
- ssh_client_config_test.go: dropped TestSameDirOrParentHandlesSymlinks,
  TestSyncVMSSHClientConfigPreservesUserKeyInLegacyDir,
  TestSyncVMSSHClientConfigNarrowsCleanupToLegacyFile,
  TestSyncVMSSHClientConfigLeavesUnexpectedLegacyContents,
  TestInstallUserSSHIncludeMigratesLegacyInlineBlock, plus the
  "legacy posture" regression strings in the remaining happy-path
  test; TestUninstallUserSSHIncludeRemovesBothMarkerBlocks collapsed
  to a single-block test
- migrations_test.go: dropped TestMigrateDropDeadImageColumns_AcrossInstallPaths,
  TestDropColumnIfExistsIsIdempotent; TestOpenReadOnlyDoesNotRunMigrations
  simplified to test against the baseline marker

Removed (docs):
- README.md "**Migration note.**" blockquote about the SSH-key path move
- docs/advanced.md parenthetical "(the old behaviour)"

Reworded (comments):
- Dropped "Previously this file also contained LogLevel DEBUG3..."
  history from vm_disk.go's sshdGuestConfig doc
- Dropped "Call sites that previously read vm.Runtime.{PID,...}"
  from vm_handles.go; now documents the current contract
- Dropped "Pre-v0.1 the defaults are" scaffolding in doctor_test.go
- Dropped "no longer does its own git inspection" phrasing in vm_run.go
- Dropped the "(also cleans up legacy inline block from pre-opt-in
  builds)" aside on the `ssh-config` CLI docstring
- Renamed test var `legacyKey` → `existingKey` in vm_test.go; its
  purpose was "pre-existing authorized_keys line," not banger-legacy

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 13:56:32 -03:00

211 lines
6.3 KiB
Go

package daemon

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"sync"

	"banger/internal/model"
	"banger/internal/system"
)

// handleCache is the daemon's in-memory map of per-VM transient
// handles. It is the sole runtime source of truth for PID / tap /
// loop / DM state — persistent storage (the per-VM handles.json
// scratch file) exists only so the daemon can rebuild the cache
// after a restart.
type handleCache struct {
	mu sync.RWMutex
	m  map[string]model.VMHandles
}

func newHandleCache() *handleCache {
	return &handleCache{m: make(map[string]model.VMHandles)}
}

// get returns the cached handles for vmID and whether an entry
// exists. A missing entry means "no live handles tracked," which is
// the correct state for stopped VMs.
func (c *handleCache) get(vmID string) (model.VMHandles, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	h, ok := c.m[vmID]
	return h, ok
}

func (c *handleCache) set(vmID string, h model.VMHandles) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.m[vmID] = h
}

func (c *handleCache) clear(vmID string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	delete(c.m, vmID)
}

// handlesFilePath returns the scratch file path inside the VM
// directory where the daemon writes the last-known handles.
func handlesFilePath(vmDir string) string {
	return filepath.Join(vmDir, "handles.json")
}
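
// For example (the VM directory below is illustrative, not a real
// banger layout guarantee):
//
//	handlesFilePath("/var/lib/banger/vms/vm-abc")
//	// -> "/var/lib/banger/vms/vm-abc/handles.json"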

// writeHandlesFile persists h to <vmDir>/handles.json. Called
// whenever the daemon successfully transitions a VM to running
// (after all handles are acquired). Best-effort: a write failure is
// logged by the caller, not propagated — the in-memory cache is
// authoritative while the daemon is up.
func writeHandlesFile(vmDir string, h model.VMHandles) error {
	if vmDir == "" {
		return errors.New("vm dir is required")
	}
	if err := os.MkdirAll(vmDir, 0o755); err != nil {
		return err
	}
	data, err := json.MarshalIndent(h, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(handlesFilePath(vmDir), data, 0o600)
}
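
// Illustration of the write path. The full model.VMHandles field set
// lives in internal/model and isn't spelled out here; this file only
// ever reads PID, so the literal below is a deliberately minimal,
// assumed shape:
//
//	_ = writeHandlesFile(vm.Runtime.VMDir, model.VMHandles{
//		PID: 4242, // tap / loop / DM fields elided
//	})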

// readHandlesFile loads the scratch file written at the last start.
// Returns zero-value handles + (false, nil) if the file doesn't
// exist — that's the normal case for stopped VMs.
func readHandlesFile(vmDir string) (model.VMHandles, bool, error) {
	if vmDir == "" {
		return model.VMHandles{}, false, nil
	}
	data, err := os.ReadFile(handlesFilePath(vmDir))
	if os.IsNotExist(err) {
		return model.VMHandles{}, false, nil
	}
	if err != nil {
		return model.VMHandles{}, false, err
	}
	var h model.VMHandles
	if err := json.Unmarshal(data, &h); err != nil {
		return model.VMHandles{}, false, fmt.Errorf("parse handles.json: %w", err)
	}
	return h, true, nil
}

func removeHandlesFile(vmDir string) {
	if vmDir == "" {
		return
	}
	_ = os.Remove(handlesFilePath(vmDir))
}

// ensureHandleCache lazily constructs the cache so direct
// `&Daemon{}` literals (common in tests) don't have to initialise
// it. Production code goes through Open(), which also builds it.
func (s *VMService) ensureHandleCache() {
	if s.handles == nil {
		s.handles = newHandleCache()
	}
}

// setVMHandlesInMemory is a test-only cache seed that skips the
// scratch-file write. Production callers should use setVMHandles so
// the on-disk copy survives a daemon restart.
func (s *VMService) setVMHandlesInMemory(vmID string, h model.VMHandles) {
	if s == nil {
		return
	}
	s.ensureHandleCache()
	s.handles.set(vmID, h)
}
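
// For instance, a test can fake a live VM without any file I/O
// (the ID and PID are illustrative):
//
//	svc.setVMHandlesInMemory("vm-abc", model.VMHandles{PID: 4242})
//	// vmAlive / vmHandles now see PID 4242 for "vm-abc".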

// vmHandles returns the cached handles for vmID (zero-value if no
// entry). The in-process handle cache is the authoritative source
// for PID / loops / dm-name — VMRecord.Runtime holds only paths.
func (s *VMService) vmHandles(vmID string) model.VMHandles {
	if s == nil {
		return model.VMHandles{}
	}
	s.ensureHandleCache()
	h, _ := s.handles.get(vmID)
	return h
}

// setVMHandles updates the in-memory cache AND the per-VM scratch
// file. Scratch-file errors are logged but not returned; the cache
// write is authoritative while the daemon is alive.
func (s *VMService) setVMHandles(vm model.VMRecord, h model.VMHandles) {
	if s == nil {
		return
	}
	s.ensureHandleCache()
	s.handles.set(vm.ID, h)
	if err := writeHandlesFile(vm.Runtime.VMDir, h); err != nil && s.logger != nil {
		s.logger.Warn("persist handles.json failed", "vm_id", vm.ID, "error", err.Error())
	}
}

// clearVMHandles drops the cache entry and removes the scratch
// file. Called on stop / delete / after a failed start.
func (s *VMService) clearVMHandles(vm model.VMRecord) {
	if s == nil {
		return
	}
	s.ensureHandleCache()
	s.handles.clear(vm.ID)
	removeHandlesFile(vm.Runtime.VMDir)
}

// vmAlive is the canonical "is this VM actually running?" check. It
// reads the PID from the handle cache, which is authoritative
// in-process, rather than from VMRecord.Runtime, and verifies that
// PID against the api socket so a recycled PID can't false-positive.
func (s *VMService) vmAlive(vm model.VMRecord) bool {
	if vm.State != model.VMStateRunning {
		return false
	}
	h := s.vmHandles(vm.ID)
	if h.PID <= 0 {
		return false
	}
	return system.ProcessRunning(h.PID, vm.Runtime.APISockPath)
}
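
// A hypothetical guard at an operation boundary (caller and message
// are illustrative, not from this package):
//
//	if !s.vmAlive(vm) {
//		return fmt.Errorf("vm %s is not running", vm.ID)
//	}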

// rediscoverHandles loads what the last daemon start knew about a VM
// from its handles.json scratch file and verifies the firecracker
// process is still alive. Returns:
//
//   - handles: the scratch-file contents (zero-value if no file).
//     ALWAYS returned, even when alive=false, because the caller
//     needs them to tear down kernel state (dm-snapshot, loops, tap)
//     that the previous daemon left behind when it died.
//   - alive: true iff a firecracker process matching the api sock is
//     currently running.
//   - err: unexpected failure (file exists but is corrupt).
//
// Strategy: pgrep by api sock path first (handles the case where
// the daemon crashed but the PID changed on respawn — unlikely for
// firecracker, but cheap insurance); fall back to verifying the
// scratch file's PID directly.
func (s *VMService) rediscoverHandles(ctx context.Context, vm model.VMRecord) (model.VMHandles, bool, error) {
	saved, _, err := readHandlesFile(vm.Runtime.VMDir)
	if err != nil {
		return model.VMHandles{}, false, err
	}
	apiSock := vm.Runtime.APISockPath
	if apiSock == "" {
		return saved, false, nil
	}
	if pid, pidErr := s.net.findFirecrackerPID(ctx, apiSock); pidErr == nil && pid > 0 {
		saved.PID = pid
		return saved, true, nil
	}
	if saved.PID > 0 && system.ProcessRunning(saved.PID, apiSock) {
		return saved, true, nil
	}
	return saved, false, nil
}
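
// A sketch of the daemon-restart reconciliation this enables.
// teardownKernelState is an assumed helper standing in for whatever
// actually reclaims dm-snapshot / loop / tap state; it is not part
// of this file:
//
//	handles, alive, err := s.rediscoverHandles(ctx, vm)
//	if err != nil {
//		return err
//	}
//	if alive {
//		s.setVMHandles(vm, handles) // rebuild cache + scratch file
//		return nil
//	}
//	// firecracker is gone: reclaim leftover kernel state, then
//	// forget the stale handles.
//	s.teardownKernelState(ctx, vm, handles)
//	s.clearVMHandles(vm)
//	return nil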