Cleanup identity for kernel objects was split across two sources of
truth: vm.Runtime (DB-backed, durable) held paths and the guest IP,
but the TAP name lived only in the in-process handle cache + the
best-effort handles.json scratch file next to the VM dir. Every
other cleanup-identifying datum has a fallback — firecracker PID
can be rediscovered via `pgrep -f <apiSock>`, loops via losetup, dm
name from the deterministic ShortID(vm.ID). The tap is the one
truly cache-only datum (allocated from a pool, not derivable).
That made NAT teardown fragile:
- daemon crash between `acquireTap` and the handles.json write
- handles.json corrupt on the next daemon start
- partial cleanup that already zeroed the cache
In any of those cases natCapability.Cleanup short-circuited
("skipping nat cleanup without runtime network handles") and the
per-VM POSTROUTING MASQUERADE + the two FORWARD rules keyed off
the tap would leak. The VM row in the DB still existed, so a retry
couldn't close the loop — the tap name was simply gone.
Fix: mirror TapDevice onto model.VMRuntime (serialised via the
existing runtime_json column, omitempty so existing rows upgrade
cleanly). Set it in startVMLocked right next to the
s.setVMHandles call that seeds the in-memory cache; clear it at
every post-cleanup reset site (stop normal path + stop stale
branch, kill normal path + kill stale branch, cleanupOnErr in
start, reconcile's stale-vm branch, the stats poller's auto-stop
path).
Fallbacks now cascade:
- natCapability.Cleanup: handles cache → Runtime.TapDevice
- cleanupRuntime (releaseTap): handles cache → Runtime.TapDevice
Both surfaces refuse gracefully (old behaviour) only when neither
source has a value, which really does mean "no tap was ever
allocated for this VM" rather than "we lost track of it."
Test: TestNATCapabilityCleanup_FallsBackToRuntimeTapDevice clears
the handle cache, sets vm.Runtime.TapDevice, and asserts Cleanup
reaches the runner — the exact scenario the review flagged as a
plausible leak and the exact code path that now guarantees it
doesn't.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
253 lines
8.5 KiB
Go
253 lines
8.5 KiB
Go
package model
|
|
|
|
import (
|
|
"crypto/rand"
|
|
"encoding/hex"
|
|
"errors"
|
|
"fmt"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
const (
|
|
DefaultBridgeName = "br-fc"
|
|
DefaultBridgeIP = "172.16.0.1"
|
|
DefaultCIDR = "24"
|
|
DefaultDNS = "1.1.1.1"
|
|
DefaultSystemOverlaySize = 8 * 1024 * 1024 * 1024
|
|
DefaultWorkDiskSize = 8 * 1024 * 1024 * 1024
|
|
DefaultMemoryMiB = 2048
|
|
DefaultVCPUCount = 2
|
|
DefaultStatsPollInterval = 10 * time.Second
|
|
DefaultStaleSweepInterval = 1 * time.Minute
|
|
MaxDiskBytes int64 = 128 * 1024 * 1024 * 1024
|
|
)
|
|
|
|
type VMState string
|
|
|
|
const (
|
|
VMStateCreated VMState = "created"
|
|
VMStateRunning VMState = "running"
|
|
VMStateStopped VMState = "stopped"
|
|
VMStateError VMState = "error"
|
|
)
|
|
|
|
type DaemonConfig struct {
|
|
LogLevel string
|
|
FirecrackerBin string
|
|
SSHKeyPath string
|
|
AutoStopStaleAfter time.Duration
|
|
StatsPollInterval time.Duration
|
|
BridgeName string
|
|
BridgeIP string
|
|
CIDR string
|
|
TapPoolSize int
|
|
DefaultDNS string
|
|
DefaultImageName string
|
|
FileSync []FileSyncEntry
|
|
VMDefaults VMDefaultsOverride
|
|
}
|
|
|
|
// FileSyncEntry is a user-declared host→guest file or directory copy
|
|
// applied to each VM's work disk at vm create time. Host is expanded
|
|
// against the host user's $HOME for "~/..."; Guest is expanded
|
|
// against /root (banger VMs are single-user root). If the host path
|
|
// is a directory, it's copied recursively; if it's a file, it's
|
|
// copied as a file. Missing host paths are a soft skip (warned, not
|
|
// fatal). Mode defaults to 0600 for files and 0755 for directories.
|
|
type FileSyncEntry struct {
|
|
Host string
|
|
Guest string
|
|
Mode string
|
|
}
|
|
|
|
type Image struct {
|
|
ID string `json:"id"`
|
|
Name string `json:"name"`
|
|
Managed bool `json:"managed"`
|
|
ArtifactDir string `json:"artifact_dir,omitempty"`
|
|
RootfsPath string `json:"rootfs_path"`
|
|
WorkSeedPath string `json:"work_seed_path,omitempty"`
|
|
KernelPath string `json:"kernel_path"`
|
|
InitrdPath string `json:"initrd_path,omitempty"`
|
|
ModulesDir string `json:"modules_dir,omitempty"`
|
|
BuildSize string `json:"build_size,omitempty"`
|
|
SeededSSHPublicKeyFingerprint string `json:"seeded_ssh_public_key_fingerprint,omitempty"`
|
|
Docker bool `json:"docker"`
|
|
CreatedAt time.Time `json:"created_at"`
|
|
UpdatedAt time.Time `json:"updated_at"`
|
|
}
|
|
|
|
type VMSpec struct {
|
|
VCPUCount int `json:"vcpu_count"`
|
|
MemoryMiB int `json:"memory_mib"`
|
|
SystemOverlaySizeByte int64 `json:"system_overlay_size_bytes"`
|
|
WorkDiskSizeBytes int64 `json:"work_disk_size_bytes"`
|
|
NATEnabled bool `json:"nat_enabled"`
|
|
}
|
|
|
|
// VMRuntime holds the durable runtime state that the daemon needs
|
|
// to reach a VM: identity, declared state, and deterministic derived
|
|
// paths. Transient kernel/process handles (PID, tap, loop devices,
|
|
// dm-snapshot names) live on VMHandles, NOT here — the daemon keeps
|
|
// them in an in-memory cache backed by a per-VM handles.json scratch
|
|
// file, so a daemon restart rebuilds them from OS state rather than
|
|
// trusting whatever was last written into a SQLite column.
|
|
//
|
|
// Everything in VMRuntime is safe to persist: the paths are
|
|
// deterministic from (VM ID, layout) and survive restart unchanged;
|
|
// GuestIP and DNSName are assigned at create time and never move;
|
|
// LastError carries the last failure message for debugging. State
|
|
// mirrors VMRecord.State.
|
|
type VMRuntime struct {
|
|
State VMState `json:"state"`
|
|
GuestIP string `json:"guest_ip"`
|
|
APISockPath string `json:"api_sock_path,omitempty"`
|
|
VSockPath string `json:"vsock_path,omitempty"`
|
|
VSockCID uint32 `json:"vsock_cid,omitempty"`
|
|
LogPath string `json:"log_path,omitempty"`
|
|
MetricsPath string `json:"metrics_path,omitempty"`
|
|
DNSName string `json:"dns_name,omitempty"`
|
|
VMDir string `json:"vm_dir"`
|
|
// TapDevice mirrors VMHandles.TapDevice but persists across
|
|
// daemon restarts / handle-cache loss. NAT teardown needs the
|
|
// exact tap name to delete the FORWARD rules; if we only had
|
|
// the handle cache, a crash between tap acquire and handles.json
|
|
// write — or a corrupt handles.json on the next daemon start —
|
|
// would silently leak the rules. Storing it on the VM record
|
|
// makes cleanup correct as long as the VM row exists.
|
|
TapDevice string `json:"tap_device,omitempty"`
|
|
SystemOverlay string `json:"system_overlay_path"`
|
|
WorkDiskPath string `json:"work_disk_path"`
|
|
LastError string `json:"last_error,omitempty"`
|
|
}
|
|
|
|
type VMStats struct {
|
|
CollectedAt time.Time `json:"collected_at,omitempty"`
|
|
CPUPercent float64 `json:"cpu_percent,omitempty"`
|
|
RSSBytes int64 `json:"rss_bytes,omitempty"`
|
|
VSZBytes int64 `json:"vsz_bytes,omitempty"`
|
|
SystemOverlayBytes int64 `json:"system_overlay_bytes,omitempty"`
|
|
WorkDiskBytes int64 `json:"work_disk_bytes,omitempty"`
|
|
MetricsRaw map[string]any `json:"metrics_raw,omitempty"`
|
|
}
|
|
|
|
type VMRecord struct {
|
|
ID string `json:"id"`
|
|
Name string `json:"name"`
|
|
ImageID string `json:"image_id"`
|
|
State VMState `json:"state"`
|
|
CreatedAt time.Time `json:"created_at"`
|
|
UpdatedAt time.Time `json:"updated_at"`
|
|
LastTouchedAt time.Time `json:"last_touched_at"`
|
|
Spec VMSpec `json:"spec"`
|
|
Runtime VMRuntime `json:"runtime"`
|
|
Stats VMStats `json:"stats"`
|
|
}
|
|
|
|
type VMCreateRequest struct {
|
|
Name string
|
|
ImageName string
|
|
VCPUCount int
|
|
MemoryMiB int
|
|
SystemOverlaySizeByte int64
|
|
WorkDiskSizeBytes int64
|
|
NATEnabled bool
|
|
NoStart bool
|
|
}
|
|
|
|
type VMSetRequest struct {
|
|
IDOrName string
|
|
VCPUCount *int
|
|
MemoryMiB *int
|
|
WorkDiskSizeBytes *int64
|
|
NATEnabled *bool
|
|
}
|
|
|
|
type WorkspacePrepareMode string
|
|
|
|
const (
|
|
WorkspacePrepareModeShallowOverlay WorkspacePrepareMode = "shallow_overlay"
|
|
WorkspacePrepareModeFullCopy WorkspacePrepareMode = "full_copy"
|
|
WorkspacePrepareModeMetadataOnly WorkspacePrepareMode = "metadata_only"
|
|
)
|
|
|
|
type WorkspacePrepareResult struct {
|
|
VMID string `json:"vm_id"`
|
|
SourcePath string `json:"source_path"`
|
|
RepoRoot string `json:"repo_root"`
|
|
RepoName string `json:"repo_name"`
|
|
GuestPath string `json:"guest_path"`
|
|
Mode WorkspacePrepareMode `json:"mode"`
|
|
HeadCommit string `json:"head_commit,omitempty"`
|
|
CurrentBranch string `json:"current_branch,omitempty"`
|
|
BranchName string `json:"branch_name,omitempty"`
|
|
BaseCommit string `json:"base_commit,omitempty"`
|
|
PreparedAt time.Time `json:"prepared_at"`
|
|
}
|
|
|
|
func Now() time.Time {
|
|
return time.Now().UTC().Truncate(time.Second)
|
|
}
|
|
|
|
func NewID() (string, error) {
|
|
buf := make([]byte, 32)
|
|
if _, err := rand.Read(buf); err != nil {
|
|
return "", err
|
|
}
|
|
return hex.EncodeToString(buf), nil
|
|
}
|
|
|
|
func ParseSize(raw string) (int64, error) {
|
|
if raw == "" {
|
|
return 0, errors.New("size is required")
|
|
}
|
|
raw = strings.TrimSpace(strings.ToUpper(raw))
|
|
if raw == "" {
|
|
return 0, errors.New("size is required")
|
|
}
|
|
unit := raw[len(raw)-1]
|
|
multiplier := int64(1024 * 1024)
|
|
number := raw
|
|
switch unit {
|
|
case 'K':
|
|
multiplier = 1024
|
|
number = raw[:len(raw)-1]
|
|
case 'M':
|
|
multiplier = 1024 * 1024
|
|
number = raw[:len(raw)-1]
|
|
case 'G':
|
|
multiplier = 1024 * 1024 * 1024
|
|
number = raw[:len(raw)-1]
|
|
default:
|
|
if unit < '0' || unit > '9' {
|
|
return 0, fmt.Errorf("unsupported size suffix: %q", string(unit))
|
|
}
|
|
}
|
|
value, err := strconv.ParseInt(number, 10, 64)
|
|
if err != nil {
|
|
return 0, fmt.Errorf("parse size %q: %w", raw, err)
|
|
}
|
|
result := value * multiplier
|
|
if result <= 0 {
|
|
return 0, fmt.Errorf("size must be positive: %q", raw)
|
|
}
|
|
if result > MaxDiskBytes {
|
|
return 0, fmt.Errorf("size exceeds max of %d bytes", MaxDiskBytes)
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
func FormatSizeBytes(bytes int64) string {
|
|
switch {
|
|
case bytes%(1024*1024*1024) == 0:
|
|
return fmt.Sprintf("%dG", bytes/(1024*1024*1024))
|
|
case bytes%(1024*1024) == 0:
|
|
return fmt.Sprintf("%dM", bytes/(1024*1024))
|
|
case bytes%1024 == 0:
|
|
return fmt.Sprintf("%dK", bytes/1024)
|
|
default:
|
|
return strconv.FormatInt(bytes, 10)
|
|
}
|
|
}
|