diff --git a/internal/daemon/ARCHITECTURE.md b/internal/daemon/ARCHITECTURE.md index c8674e7..93f7d10 100644 --- a/internal/daemon/ARCHITECTURE.md +++ b/internal/daemon/ARCHITECTURE.md @@ -18,6 +18,13 @@ owning types: workspace operations on a single VM (two simultaneous tar imports would clobber each other) without touching `vmLocks`, so `vm stop` / `delete` / `restart` never queue behind a slow import. +- `handles *handleCache` — in-memory map of per-VM transient kernel/ + process handles (PID, tap device, loop devices, DM target). The + cache is rebuildable: each VM directory holds a small + `handles.json` scratch file that the daemon reads at startup to + reconstruct the cache and verify processes against `/proc` via + pgrep. Nothing in the durable `vms` SQLite row describes transient + kernel state. See `internal/daemon/vm_handles.go`. - `createVMMu sync.Mutex` — serialises `CreateVM` (guards name uniqueness + guest IP allocation window). - `imageOpsMu sync.Mutex` — serialises image-registry mutations diff --git a/internal/daemon/capabilities.go b/internal/daemon/capabilities.go index c1bbd25..b4c18cd 100644 --- a/internal/daemon/capabilities.go +++ b/internal/daemon/capabilities.go @@ -277,9 +277,10 @@ func (natCapability) Cleanup(ctx context.Context, d *Daemon, vm model.VMRecord) if !vm.Spec.NATEnabled { return nil } - if strings.TrimSpace(vm.Runtime.GuestIP) == "" || strings.TrimSpace(vm.Runtime.TapDevice) == "" { + tap := d.vmHandles(vm.ID).TapDevice + if strings.TrimSpace(vm.Runtime.GuestIP) == "" || strings.TrimSpace(tap) == "" { if d.logger != nil { - d.logger.Debug("skipping nat cleanup without runtime network handles", append(vmLogAttrs(vm), "guest_ip", vm.Runtime.GuestIP, "tap_device", vm.Runtime.TapDevice)...) + d.logger.Debug("skipping nat cleanup without runtime network handles", append(vmLogAttrs(vm), "guest_ip", vm.Runtime.GuestIP, "tap_device", tap)...) } return nil } @@ -290,7 +291,7 @@ func (natCapability) ApplyConfigChange(ctx context.Context, d *Daemon, before, a if before.Spec.NATEnabled == after.Spec.NATEnabled { return nil } - if after.State != model.VMStateRunning || !system.ProcessRunning(after.Runtime.PID, after.Runtime.APISockPath) { + if !d.vmAlive(after) { return nil } return d.ensureNAT(ctx, after, after.Spec.NATEnabled) diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go index 0c252e5..a548294 100644 --- a/internal/daemon/daemon.go +++ b/internal/daemon/daemon.go @@ -44,7 +44,14 @@ type Daemon struct { // other's tar streams). It is a SEPARATE scope from vmLocks so // slow guest I/O — SSH dial, tar upload, chmod — does not block // vm stop/delete/restart. See ARCHITECTURE.md. - workspaceLocks vmLockSet + workspaceLocks vmLockSet + // handles caches per-VM transient kernel/process handles (PID, + // tap device, loop devices, DM name/device). Populated at vm + // start and at daemon startup reconcile; cleared on stop/delete. + // See internal/daemon/vm_handles.go — persistent durable state + // lives in the store, this is rebuildable from a per-VM + // handles.json scratch file and OS inspection. 
+ handles *handleCache sessions sessionRegistry tapPool tapPool closing chan struct{} @@ -94,6 +101,7 @@ func Open(ctx context.Context) (d *Daemon, err error) { logger: logger, closing: make(chan struct{}), pid: os.Getpid(), + handles: newHandleCache(), sessions: newSessionRegistry(), } d.ensureVMSSHClientConfig() @@ -382,7 +390,7 @@ func (d *Daemon) dispatch(ctx context.Context, req rpc.Request) rpc.Response { if err != nil { return rpc.NewError("not_found", err.Error()) } - if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if !d.vmAlive(vm) { return rpc.NewError("not_running", fmt.Sprintf("vm %s is not running", vm.Name)) } return marshalResultOrError(api.VMSSHResult{Name: vm.Name, GuestIP: vm.Runtime.GuestIP}, nil) @@ -609,16 +617,32 @@ func (d *Daemon) reconcile(ctx context.Context) error { for _, vm := range vms { if err := d.withVMLockByIDErr(ctx, vm.ID, func(vm model.VMRecord) error { if vm.State != model.VMStateRunning { + // Belt-and-braces: a stopped VM should never have a + // scratch file or a cache entry. Clean up anything + // left by an ungraceful previous daemon crash. + d.clearVMHandles(vm) return nil } - if system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + // Rebuild the in-memory handle cache by loading the per-VM + // scratch file and verifying the firecracker process is + // still alive. + h, alive, err := d.rediscoverHandles(ctx, vm) + if err != nil && d.logger != nil { + d.logger.Warn("rediscover handles failed", "vm_id", vm.ID, "error", err.Error()) + } + // Either way, seed the cache with what the scratch file + // claimed. If alive, subsequent vmAlive() calls pass; if + // not, cleanupRuntime needs these handles to know which + // kernel resources (DM / loops / tap) to tear down. + d.setVMHandlesInMemory(vm.ID, h) + if alive { return nil } op.stage("stale_vm", vmLogAttrs(vm)...) 
_ = d.cleanupRuntime(ctx, vm, true) vm.State = model.VMStateStopped vm.Runtime.State = model.VMStateStopped - clearRuntimeHandles(&vm) + d.clearVMHandles(vm) vm.UpdatedAt = model.Now() return d.store.UpsertVM(ctx, vm) }); err != nil { diff --git a/internal/daemon/dashboard.go b/internal/daemon/dashboard.go index b0953b5..cfd42d9 100644 --- a/internal/daemon/dashboard.go +++ b/internal/daemon/dashboard.go @@ -52,7 +52,7 @@ func (d *Daemon) DashboardSummary(ctx context.Context) (api.DashboardSummary, er summary.Banger.ConfiguredDiskBytes += vm.Spec.WorkDiskSizeBytes summary.Banger.UsedSystemOverlayBytes += vm.Stats.SystemOverlayBytes summary.Banger.UsedWorkDiskBytes += vm.Stats.WorkDiskBytes - if vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if d.vmAlive(vm) { summary.Banger.RunningVMCount++ summary.Banger.RunningCPUPercent += vm.Stats.CPUPercent summary.Banger.RunningRSSBytes += vm.Stats.RSSBytes diff --git a/internal/daemon/guest_sessions.go b/internal/daemon/guest_sessions.go index 0c15739..6a4cddb 100644 --- a/internal/daemon/guest_sessions.go +++ b/internal/daemon/guest_sessions.go @@ -74,7 +74,7 @@ func (d *Daemon) refreshGuestSession(ctx context.Context, vm model.VMRecord, s m return s, err } original := s - session.ApplyStateSnapshot(&s, snapshot, vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath)) + session.ApplyStateSnapshot(&s, snapshot, d.vmAlive(vm)) if session.StateChanged(original, s) { s.UpdatedAt = model.Now() if err := d.store.UpsertGuestSession(ctx, s); err != nil { @@ -85,7 +85,7 @@ func (d *Daemon) refreshGuestSession(ctx context.Context, vm model.VMRecord, s m } func (d *Daemon) inspectGuestSessionState(ctx context.Context, vm model.VMRecord, s model.GuestSession) (session.StateSnapshot, error) { - if vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if d.vmAlive(vm) { client, err := guest.Dial(ctx, net.JoinHostPort(vm.Runtime.GuestIP, "22"), d.config.SSHKeyPath) if err != nil { return session.StateSnapshot{}, err diff --git a/internal/daemon/guest_sessions_test.go b/internal/daemon/guest_sessions_test.go index 5ec5e1b..bbe5f13 100644 --- a/internal/daemon/guest_sessions_test.go +++ b/internal/daemon/guest_sessions_test.go @@ -94,7 +94,6 @@ func TestSendToGuestSession_HappyPath(t *testing.T) { vm := testVM("sendbox", "image-send", "172.16.0.88") vm.State = model.VMStateRunning vm.Runtime.State = model.VMStateRunning - vm.Runtime.PID = firecracker.Process.Pid vm.Runtime.APISockPath = apiSock upsertDaemonVM(t, ctx, db, vm) @@ -105,6 +104,7 @@ func TestSendToGuestSession_HappyPath(t *testing.T) { fake := &recordingGuestSSHClient{} d := newSendTestDaemon(t, db, fake) + d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid}) payload := []byte(`{"type":"abort"}` + "\n") result, err := d.SendToGuestSession(ctx, api.GuestSessionSendParams{ @@ -159,7 +159,6 @@ func TestSendToGuestSession_EmptyPayload(t *testing.T) { vm := testVM("sendbox-empty", "image-send", "172.16.0.89") vm.State = model.VMStateRunning vm.Runtime.State = model.VMStateRunning - vm.Runtime.PID = firecracker.Process.Pid vm.Runtime.APISockPath = apiSock upsertDaemonVM(t, ctx, db, vm) @@ -170,6 +169,7 @@ func TestSendToGuestSession_EmptyPayload(t *testing.T) { fake := &recordingGuestSSHClient{} d := newSendTestDaemon(t, db, fake) + d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid}) result, err := 
d.SendToGuestSession(ctx, api.GuestSessionSendParams{ VMIDOrName: vm.Name, @@ -423,7 +423,6 @@ func TestPrepareWorkspaceThenStartGuestSessionPassesCWDPreflight(t *testing.T) { vm := testVM("pi-devbox", "image-pi", "172.16.0.77") vm.State = model.VMStateRunning vm.Runtime.State = model.VMStateRunning - vm.Runtime.PID = firecracker.Process.Pid vm.Runtime.APISockPath = apiSock upsertDaemonVM(t, ctx, db, vm) @@ -433,6 +432,7 @@ func TestPrepareWorkspaceThenStartGuestSessionPassesCWDPreflight(t *testing.T) { config: model.DaemonConfig{SSHKeyPath: filepath.Join(t.TempDir(), "id_ed25519")}, logger: slog.New(slog.NewTextHandler(io.Discard, nil)), } + d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid}) d.guestWaitForSSH = func(context.Context, string, string, time.Duration) error { return nil } d.guestDial = func(context.Context, string, string) (guestSSHClient, error) { return fakeClient, nil } d.waitForGuestSessionReady = func(_ context.Context, _ model.VMRecord, session model.GuestSession) (model.GuestSession, error) { diff --git a/internal/daemon/logger.go b/internal/daemon/logger.go index abf1582..8771609 100644 --- a/internal/daemon/logger.go +++ b/internal/daemon/logger.go @@ -98,6 +98,10 @@ func (o operationLog) log(level slog.Level, msg string, attrs ...any) { o.logger.Log(context.Background(), level, msg, base...) } +// vmLogAttrs returns the durable identifying fields for a VM that +// are always safe to log. Transient handles (PID, tap device) moved +// off VMRecord when the schema was split; lifecycle ops log those +// explicitly on the events where they matter (e.g. wait_for_exit). func vmLogAttrs(vm model.VMRecord) []any { attrs := []any{ "vm_id", vm.ID, @@ -107,15 +111,9 @@ func vmLogAttrs(vm model.VMRecord) []any { if vm.Runtime.GuestIP != "" { attrs = append(attrs, "guest_ip", vm.Runtime.GuestIP) } - if vm.Runtime.TapDevice != "" { - attrs = append(attrs, "tap_device", vm.Runtime.TapDevice) - } if vm.Runtime.APISockPath != "" { attrs = append(attrs, "api_socket", vm.Runtime.APISockPath) } - if vm.Runtime.PID > 0 { - attrs = append(attrs, "pid", vm.Runtime.PID) - } if vm.Runtime.LogPath != "" { attrs = append(attrs, "log_path", vm.Runtime.LogPath) } diff --git a/internal/daemon/nat.go b/internal/daemon/nat.go index e38f6a3..b0d4231 100644 --- a/internal/daemon/nat.go +++ b/internal/daemon/nat.go @@ -11,7 +11,7 @@ import ( type natRule = hostnat.Rule func (d *Daemon) ensureNAT(ctx context.Context, vm model.VMRecord, enable bool) error { - return hostnat.Ensure(ctx, d.runner, vm.Runtime.GuestIP, vm.Runtime.TapDevice, enable) + return hostnat.Ensure(ctx, d.runner, vm.Runtime.GuestIP, d.vmHandles(vm.ID).TapDevice, enable) } func (d *Daemon) validateNATPrereqs(ctx context.Context) (string, error) { @@ -32,8 +32,12 @@ func parseDefaultUplink(output string) (string, error) { return hostnat.ParseDefaultUplink(output) } -func natRulesForVM(vm model.VMRecord, uplink string) ([]natRule, error) { - return hostnat.Rules(vm.Runtime.GuestIP, vm.Runtime.TapDevice, uplink) +// natRulesForVM builds the iptables rule set for vm + tap + uplink. +// tap is passed explicitly (rather than read from a handle cache) +// because natRulesForVM has no Daemon receiver — it's usable from +// test helpers that build rule expectations without a daemon. 
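To make the split concrete: a caller with a `Daemon` receiver threads the tap in from the handle cache, while `nat_test.go` below passes a literal. A minimal sketch, assuming only the signatures shown in this diff; `natRulesForRunningVM` is a hypothetical wrapper, not part of the change:

```go
// Hypothetical wrapper (illustration only): daemon-side code reads the
// tap device through the handle cache and hands it to the receiver-less
// helper. The real runtime path is ensureNAT above, which delegates to
// hostnat.Ensure instead of building rules itself.
func (d *Daemon) natRulesForRunningVM(vm model.VMRecord, uplink string) ([]natRule, error) {
	return natRulesForVM(vm, d.vmHandles(vm.ID).TapDevice, uplink)
}

// Test-side callers need neither a Daemon nor a cache:
//   rules, err := natRulesForVM(vm, "tap-fc-abcd1234", "wlan0")
```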
+func natRulesForVM(vm model.VMRecord, tap, uplink string) ([]natRule, error) { + return hostnat.Rules(vm.Runtime.GuestIP, tap, uplink) } func natRuleArgs(action string, rule natRule) []string { diff --git a/internal/daemon/nat_test.go b/internal/daemon/nat_test.go index d5a01d0..e844e05 100644 --- a/internal/daemon/nat_test.go +++ b/internal/daemon/nat_test.go @@ -33,11 +33,10 @@ func TestNATRulesForVM(t *testing.T) { vm := model.VMRecord{ Runtime: model.VMRuntime{ - GuestIP: "172.16.0.8", - TapDevice: "tap-fc-abcd1234", + GuestIP: "172.16.0.8", }, } - rules, err := natRulesForVM(vm, "wlan0") + rules, err := natRulesForVM(vm, "tap-fc-abcd1234", "wlan0") if err != nil { t.Fatalf("natRulesForVM returned error: %v", err) } @@ -61,30 +60,25 @@ func TestNATRulesForVMRequiresRuntimeData(t *testing.T) { tests := []struct { name string vm model.VMRecord + tap string uplink string }{ { - name: "guest ip", - vm: model.VMRecord{ - Runtime: model.VMRuntime{TapDevice: "tap-fc-abcd1234"}, - }, + name: "guest ip", + vm: model.VMRecord{}, + tap: "tap-fc-abcd1234", uplink: "eth0", }, { - name: "tap", - vm: model.VMRecord{ - Runtime: model.VMRuntime{GuestIP: "172.16.0.8"}, - }, + name: "tap", + vm: model.VMRecord{Runtime: model.VMRuntime{GuestIP: "172.16.0.8"}}, + tap: "", uplink: "eth0", }, { - name: "uplink", - vm: model.VMRecord{ - Runtime: model.VMRuntime{ - GuestIP: "172.16.0.8", - TapDevice: "tap-fc-abcd1234", - }, - }, + name: "uplink", + vm: model.VMRecord{Runtime: model.VMRuntime{GuestIP: "172.16.0.8"}}, + tap: "tap-fc-abcd1234", uplink: "", }, } @@ -93,7 +87,7 @@ func TestNATRulesForVMRequiresRuntimeData(t *testing.T) { tt := tt t.Run(tt.name, func(t *testing.T) { t.Parallel() - if _, err := natRulesForVM(tt.vm, tt.uplink); err == nil { + if _, err := natRulesForVM(tt.vm, tt.tap, tt.uplink); err == nil { t.Fatalf("expected natRulesForVM to fail for missing %s", tt.name) } }) diff --git a/internal/daemon/ports.go b/internal/daemon/ports.go index 0c472f0..40ab0c0 100644 --- a/internal/daemon/ports.go +++ b/internal/daemon/ports.go @@ -15,7 +15,6 @@ import ( "banger/internal/api" "banger/internal/model" - "banger/internal/system" "banger/internal/vmdns" "banger/internal/vsockagent" ) @@ -29,7 +28,7 @@ func (d *Daemon) PortsVM(ctx context.Context, idOrName string) (result api.VMPor if result.DNSName == "" && strings.TrimSpace(vm.Name) != "" { result.DNSName = vmdns.RecordName(vm.Name) } - if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if !d.vmAlive(vm) { return model.VMRecord{}, fmt.Errorf("vm %s is not running", vm.Name) } if strings.TrimSpace(vm.Runtime.GuestIP) == "" { diff --git a/internal/daemon/session_attach.go b/internal/daemon/session_attach.go index f5301ee..9fef26b 100644 --- a/internal/daemon/session_attach.go +++ b/internal/daemon/session_attach.go @@ -15,7 +15,6 @@ import ( "banger/internal/guest" "banger/internal/model" "banger/internal/sessionstream" - "banger/internal/system" ) func (d *Daemon) BeginGuestSessionAttach(ctx context.Context, params api.GuestSessionAttachBeginParams) (api.GuestSessionAttachBeginResult, error) { @@ -162,7 +161,7 @@ func (d *Daemon) attachGuestSessionBridge(session model.GuestSession, controller if err != nil { return err } - if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if !d.vmAlive(vm) { return fmt.Errorf("vm %q is not running", vm.Name) } address := net.JoinHostPort(vm.Runtime.GuestIP, "22") diff --git 
a/internal/daemon/session_lifecycle.go b/internal/daemon/session_lifecycle.go index b22d9e2..18e4b02 100644 --- a/internal/daemon/session_lifecycle.go +++ b/internal/daemon/session_lifecycle.go @@ -13,7 +13,6 @@ import ( sess "banger/internal/daemon/session" "banger/internal/guest" "banger/internal/model" - "banger/internal/system" ) func (d *Daemon) StartGuestSession(ctx context.Context, params api.GuestSessionStartParams) (model.GuestSession, error) { @@ -29,7 +28,7 @@ func (d *Daemon) StartGuestSession(ctx context.Context, params api.GuestSessionS } var created model.GuestSession _, err := d.withVMLockByRef(ctx, params.VMIDOrName, func(vm model.VMRecord) (model.VMRecord, error) { - if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if !d.vmAlive(vm) { return model.VMRecord{}, fmt.Errorf("vm %q is not running", vm.Name) } session, err := d.startGuestSessionLocked(ctx, vm, params, stdinMode) @@ -184,7 +183,7 @@ func (d *Daemon) signalGuestSession(ctx context.Context, params api.GuestSession if session.Status == model.GuestSessionStatusExited || session.Status == model.GuestSessionStatusFailed { return session, nil } - if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if !d.vmAlive(vm) { session.Status = model.GuestSessionStatusFailed session.LastError = "vm is not running" now := model.Now() diff --git a/internal/daemon/session_stream.go b/internal/daemon/session_stream.go index 93a7344..fea9c54 100644 --- a/internal/daemon/session_stream.go +++ b/internal/daemon/session_stream.go @@ -59,7 +59,7 @@ func (d *Daemon) SendToGuestSession(ctx context.Context, params api.GuestSession if session.Status != model.GuestSessionStatusRunning { return api.GuestSessionSendResult{}, fmt.Errorf("session is not running (status=%s)", session.Status) } - if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if !d.vmAlive(vm) { return api.GuestSessionSendResult{}, fmt.Errorf("vm %q is not running", vm.Name) } if len(params.Payload) == 0 { @@ -89,7 +89,7 @@ func (d *Daemon) SendToGuestSession(ctx context.Context, params api.GuestSession } func (d *Daemon) readGuestSessionLog(ctx context.Context, vm model.VMRecord, session model.GuestSession, stream string, tailLines int) (string, error) { - if vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if d.vmAlive(vm) { client, err := guest.Dial(ctx, net.JoinHostPort(vm.Runtime.GuestIP, "22"), d.config.SSHKeyPath) if err != nil { return "", err diff --git a/internal/daemon/tap_pool.go b/internal/daemon/tap_pool.go index 75cf44c..9d5e172 100644 --- a/internal/daemon/tap_pool.go +++ b/internal/daemon/tap_pool.go @@ -28,7 +28,7 @@ func (d *Daemon) initializeTapPool(ctx context.Context) error { } next := 0 for _, vm := range vms { - if index, ok := parseTapPoolIndex(vm.Runtime.TapDevice); ok && index >= next { + if index, ok := parseTapPoolIndex(d.vmHandles(vm.ID).TapDevice); ok && index >= next { next = index + 1 } } diff --git a/internal/daemon/vm.go b/internal/daemon/vm.go index bf0d8ac..6c4ed35 100644 --- a/internal/daemon/vm.go +++ b/internal/daemon/vm.go @@ -85,7 +85,8 @@ func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserve if d.logger != nil { d.logger.Debug("cleanup runtime", append(vmLogAttrs(vm), "preserve_disks", preserveDisks)...) 
} - cleanupPID := vm.Runtime.PID + h := d.vmHandles(vm.ID) + cleanupPID := h.PID if vm.Runtime.APISockPath != "" { if pid, err := d.findFirecrackerPID(ctx, vm.Runtime.APISockPath); err == nil && pid > 0 { cleanupPID = pid @@ -98,15 +99,15 @@ func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserve } } snapshotErr := d.cleanupDMSnapshot(ctx, dmSnapshotHandles{ - BaseLoop: vm.Runtime.BaseLoop, - COWLoop: vm.Runtime.COWLoop, - DMName: vm.Runtime.DMName, - DMDev: vm.Runtime.DMDev, + BaseLoop: h.BaseLoop, + COWLoop: h.COWLoop, + DMName: h.DMName, + DMDev: h.DMDev, }) featureErr := d.cleanupCapabilityState(ctx, vm) var tapErr error - if vm.Runtime.TapDevice != "" { - tapErr = d.releaseTap(ctx, vm.Runtime.TapDevice) + if h.TapDevice != "" { + tapErr = d.releaseTap(ctx, h.TapDevice) } if vm.Runtime.APISockPath != "" { _ = os.Remove(vm.Runtime.APISockPath) @@ -114,22 +115,16 @@ func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserve if vm.Runtime.VSockPath != "" { _ = os.Remove(vm.Runtime.VSockPath) } + // The handles are only meaningful while the kernel objects exist; + // dropping them here keeps the cache in sync with reality even + // when the caller forgets to call clearVMHandles explicitly. + d.clearVMHandles(vm) if !preserveDisks && vm.Runtime.VMDir != "" { return errors.Join(snapshotErr, featureErr, tapErr, os.RemoveAll(vm.Runtime.VMDir)) } return errors.Join(snapshotErr, featureErr, tapErr) } -func clearRuntimeHandles(vm *model.VMRecord) { - vm.Runtime.PID = 0 - vm.Runtime.APISockPath = "" - vm.Runtime.TapDevice = "" - vm.Runtime.BaseLoop = "" - vm.Runtime.COWLoop = "" - vm.Runtime.DMName = "" - vm.Runtime.DMDev = "" -} - func defaultVSockPath(runtimeDir, vmID string) string { return filepath.Join(runtimeDir, "fc-"+system.ShortID(vmID)+".vsock") } @@ -205,10 +200,7 @@ func (d *Daemon) rebuildDNS(ctx context.Context) error { } records := make(map[string]string) for _, vm := range vms { - if vm.State != model.VMStateRunning { - continue - } - if !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if !d.vmAlive(vm) { continue } if strings.TrimSpace(vm.Runtime.GuestIP) == "" { diff --git a/internal/daemon/vm_disk.go b/internal/daemon/vm_disk.go index a5df6e7..4033f25 100644 --- a/internal/daemon/vm_disk.go +++ b/internal/daemon/vm_disk.go @@ -26,12 +26,20 @@ func (d *Daemon) ensureSystemOverlay(ctx context.Context, vm *model.VMRecord) er return err } +// patchRootOverlay writes the per-VM config files (resolv.conf, +// hostname, hosts, sshd drop-in, network bootstrap, fstab) into the +// rootfs overlay. Reads the DM device path from the handle cache, +// which the start flow populates before calling this. 
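For orientation, the intended ordering in the start flow reads roughly like the sketch below. This is a condensation under assumptions: the `prepareRootfs` name and the exact call sequence are hypothetical, while `createDMSnapshot`, `setVMHandles`, and `patchRootOverlay` come from this diff.

```go
// Hypothetical condensation of the relevant startVMLocked steps (see the
// vm_lifecycle.go hunk further down): create the dm-snapshot, publish its
// handles to the cache, and only then patch the root overlay.
func (d *Daemon) prepareRootfs(ctx context.Context, vm model.VMRecord, image model.Image, dmName string) error {
	snap, err := d.createDMSnapshot(ctx, image.RootfsPath, vm.Runtime.SystemOverlay, dmName)
	if err != nil {
		return err
	}
	d.setVMHandles(vm, model.VMHandles{
		BaseLoop: snap.BaseLoop,
		COWLoop:  snap.COWLoop,
		DMName:   snap.DMName,
		DMDev:    snap.DMDev,
	})
	// patchRootOverlay can now resolve DMDev from the cache.
	return d.patchRootOverlay(ctx, vm, image)
}
```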
func (d *Daemon) patchRootOverlay(ctx context.Context, vm model.VMRecord, image model.Image) error { + dmDev := d.vmHandles(vm.ID).DMDev + if dmDev == "" { + return fmt.Errorf("vm %q: DM device not in handle cache — start flow out of order?", vm.ID) + } resolv := []byte(fmt.Sprintf("nameserver %s\n", d.config.DefaultDNS)) hostname := []byte(vm.Name + "\n") hosts := []byte(fmt.Sprintf("127.0.0.1 localhost\n127.0.1.1 %s\n", vm.Name)) sshdConfig := []byte(sshdGuestConfig()) - fstab, err := system.ReadDebugFSText(ctx, d.runner, vm.Runtime.DMDev, "/etc/fstab") + fstab, err := system.ReadDebugFSText(ctx, d.runner, dmDev, "/etc/fstab") if err != nil { fstab = "" } @@ -66,12 +74,12 @@ func (d *Daemon) patchRootOverlay(ctx context.Context, vm model.VMRecord, image for _, guestPath := range builder.FilePaths() { data := files[guestPath] if guestPath == guestnet.GuestScriptPath { - if err := system.WriteExt4FileMode(ctx, d.runner, vm.Runtime.DMDev, guestPath, 0o755, data); err != nil { + if err := system.WriteExt4FileMode(ctx, d.runner, dmDev, guestPath, 0o755, data); err != nil { return err } continue } - if err := system.WriteExt4File(ctx, d.runner, vm.Runtime.DMDev, guestPath, data); err != nil { + if err := system.WriteExt4File(ctx, d.runner, dmDev, guestPath, data); err != nil { return err } } @@ -109,7 +117,11 @@ func (d *Daemon) ensureWorkDisk(ctx context.Context, vm *model.VMRecord, image m if _, err := d.runner.Run(ctx, "mkfs.ext4", "-F", vm.Runtime.WorkDiskPath); err != nil { return workDiskPreparation{}, err } - rootMount, cleanupRoot, err := system.MountTempDir(ctx, d.runner, vm.Runtime.DMDev, true) + dmDev := d.vmHandles(vm.ID).DMDev + if dmDev == "" { + return workDiskPreparation{}, fmt.Errorf("vm %q: DM device not in handle cache", vm.ID) + } + rootMount, cleanupRoot, err := system.MountTempDir(ctx, d.runner, dmDev, true) if err != nil { return workDiskPreparation{}, err } diff --git a/internal/daemon/vm_handles.go b/internal/daemon/vm_handles.go new file mode 100644 index 0000000..ef367c4 --- /dev/null +++ b/internal/daemon/vm_handles.go @@ -0,0 +1,211 @@ +package daemon + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "os" + "path/filepath" + "sync" + + "banger/internal/model" + "banger/internal/system" +) + +// handleCache is the daemon's in-memory map of per-VM transient +// handles. It is the sole runtime source of truth for PID / tap / +// loop / DM state — persistent storage (the per-VM handles.json +// scratch file) exists only so the daemon can rebuild the cache +// after a restart. +type handleCache struct { + mu sync.RWMutex + m map[string]model.VMHandles +} + +func newHandleCache() *handleCache { + return &handleCache{m: make(map[string]model.VMHandles)} +} + +// get returns the cached handles for vmID and whether an entry +// exists. A missing entry means "no live handles tracked," which is +// the correct state for stopped VMs. +func (c *handleCache) get(vmID string) (model.VMHandles, bool) { + c.mu.RLock() + defer c.mu.RUnlock() + h, ok := c.m[vmID] + return h, ok +} + +func (c *handleCache) set(vmID string, h model.VMHandles) { + c.mu.Lock() + defer c.mu.Unlock() + c.m[vmID] = h +} + +func (c *handleCache) clear(vmID string) { + c.mu.Lock() + defer c.mu.Unlock() + delete(c.m, vmID) +} + +// handlesFilePath returns the scratch file path inside the VM +// directory where the daemon writes the last-known handles. +func handlesFilePath(vmDir string) string { + return filepath.Join(vmDir, "handles.json") +} + +// writeHandlesFile persists h to /handles.json. 
Called +// whenever the daemon successfully transitions a VM to running +// (after all handles are acquired). Best-effort: a write failure is +// logged, not propagated — the in-memory cache is authoritative +// while the daemon is up. +func writeHandlesFile(vmDir string, h model.VMHandles) error { + if vmDir == "" { + return errors.New("vm dir is required") + } + if err := os.MkdirAll(vmDir, 0o755); err != nil { + return err + } + data, err := json.MarshalIndent(h, "", " ") + if err != nil { + return err + } + return os.WriteFile(handlesFilePath(vmDir), data, 0o600) +} + +// readHandlesFile loads the scratch file written at the last start. +// Returns a zero-value handles + (false, nil) if the file doesn't +// exist — that's the normal case for stopped VMs. +func readHandlesFile(vmDir string) (model.VMHandles, bool, error) { + if vmDir == "" { + return model.VMHandles{}, false, nil + } + data, err := os.ReadFile(handlesFilePath(vmDir)) + if os.IsNotExist(err) { + return model.VMHandles{}, false, nil + } + if err != nil { + return model.VMHandles{}, false, err + } + var h model.VMHandles + if err := json.Unmarshal(data, &h); err != nil { + return model.VMHandles{}, false, fmt.Errorf("parse handles.json: %w", err) + } + return h, true, nil +} + +func removeHandlesFile(vmDir string) { + if vmDir == "" { + return + } + _ = os.Remove(handlesFilePath(vmDir)) +} + +// ensureHandleCache lazily constructs the cache so direct +// `&Daemon{}` literals (common in tests) don't have to initialise +// it. Production code goes through Open(), which also builds it. +func (d *Daemon) ensureHandleCache() { + if d.handles == nil { + d.handles = newHandleCache() + } +} + +// setVMHandlesInMemory is a test-only cache seed that skips the +// scratch-file write. Production callers should use setVMHandles so +// the filesystem survives a daemon restart. +func (d *Daemon) setVMHandlesInMemory(vmID string, h model.VMHandles) { + if d == nil { + return + } + d.ensureHandleCache() + d.handles.set(vmID, h) +} + +// vmHandles returns the cached handles for vm (zero-value if no +// entry). Call sites that previously read `vm.Runtime.{PID,...}` +// should read through this instead. +func (d *Daemon) vmHandles(vmID string) model.VMHandles { + if d == nil { + return model.VMHandles{} + } + d.ensureHandleCache() + h, _ := d.handles.get(vmID) + return h +} + +// setVMHandles updates the in-memory cache AND the per-VM scratch +// file. Scratch-file errors are logged but not returned; the cache +// write is authoritative while the daemon is alive. +func (d *Daemon) setVMHandles(vm model.VMRecord, h model.VMHandles) { + if d == nil { + return + } + d.ensureHandleCache() + d.handles.set(vm.ID, h) + if err := writeHandlesFile(vm.Runtime.VMDir, h); err != nil && d.logger != nil { + d.logger.Warn("persist handles.json failed", "vm_id", vm.ID, "error", err.Error()) + } +} + +// clearVMHandles drops the cache entry and removes the scratch +// file. Called on stop / delete / after a failed start. +func (d *Daemon) clearVMHandles(vm model.VMRecord) { + if d == nil { + return + } + d.ensureHandleCache() + d.handles.clear(vm.ID) + removeHandlesFile(vm.Runtime.VMDir) +} + +// vmAlive is the canonical "is this VM actually running?" check. +// Unlike the old `system.ProcessRunning(vm.Runtime.PID, apiSock)` +// pattern, this reads the PID from the handle cache — which is +// authoritative in-process — and verifies the PID against the api +// socket so a recycled PID can't false-positive. 
+func (d *Daemon) vmAlive(vm model.VMRecord) bool { + if vm.State != model.VMStateRunning { + return false + } + h := d.vmHandles(vm.ID) + if h.PID <= 0 { + return false + } + return system.ProcessRunning(h.PID, vm.Runtime.APISockPath) +} + +// rediscoverHandles loads what the last daemon start knew about a VM +// from its handles.json scratch file and verifies the firecracker +// process is still alive. Returns: +// +// - handles: the scratch-file contents (zero-value if no file). +// ALWAYS returned, even when alive=false, because the caller +// needs them to tear down kernel state (dm-snapshot, loops, tap) +// that the previous daemon left behind when it died. +// - alive: true iff a firecracker process matching the api sock is +// currently running. +// - err: unexpected failure (file exists but is corrupt). +// +// Strategy: pgrep by api sock path first (handles the case where +// the daemon crashed but the PID changed on respawn — unlikely for +// firecracker, but cheap insurance); fall back to verifying the +// scratch file's PID directly. +func (d *Daemon) rediscoverHandles(ctx context.Context, vm model.VMRecord) (model.VMHandles, bool, error) { + saved, _, err := readHandlesFile(vm.Runtime.VMDir) + if err != nil { + return model.VMHandles{}, false, err + } + apiSock := vm.Runtime.APISockPath + if apiSock == "" { + return saved, false, nil + } + if pid, pidErr := d.findFirecrackerPID(ctx, apiSock); pidErr == nil && pid > 0 { + saved.PID = pid + return saved, true, nil + } + if saved.PID > 0 && system.ProcessRunning(saved.PID, apiSock) { + return saved, true, nil + } + return saved, false, nil +} diff --git a/internal/daemon/vm_handles_test.go b/internal/daemon/vm_handles_test.go new file mode 100644 index 0000000..af170de --- /dev/null +++ b/internal/daemon/vm_handles_test.go @@ -0,0 +1,197 @@ +package daemon + +import ( + "context" + "os" + "path/filepath" + "strings" + "testing" + + "banger/internal/model" +) + +func TestHandlesFileRoundtrip(t *testing.T) { + t.Parallel() + dir := t.TempDir() + want := model.VMHandles{ + PID: 4242, + TapDevice: "tap-fc-abcd", + BaseLoop: "/dev/loop9", + COWLoop: "/dev/loop10", + DMName: "fc-rootfs-abcd", + DMDev: "/dev/mapper/fc-rootfs-abcd", + } + if err := writeHandlesFile(dir, want); err != nil { + t.Fatalf("writeHandlesFile: %v", err) + } + got, present, err := readHandlesFile(dir) + if err != nil { + t.Fatalf("readHandlesFile: %v", err) + } + if !present { + t.Fatal("readHandlesFile reported no file after write") + } + if got != want { + t.Fatalf("roundtrip mismatch:\n got %+v\n want %+v", got, want) + } +} + +func TestHandlesFileMissingReturnsZero(t *testing.T) { + t.Parallel() + h, present, err := readHandlesFile(t.TempDir()) + if err != nil { + t.Fatalf("readHandlesFile (missing): %v", err) + } + if present { + t.Fatal("present = true for missing file") + } + if !h.IsZero() { + t.Fatalf("expected zero-value handles, got %+v", h) + } +} + +func TestHandlesFileCorruptReturnsError(t *testing.T) { + t.Parallel() + dir := t.TempDir() + if err := os.WriteFile(handlesFilePath(dir), []byte("{not json"), 0o600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + if _, _, err := readHandlesFile(dir); err == nil { + t.Fatal("expected parse error for corrupt file") + } +} + +func TestHandleCacheConcurrent(t *testing.T) { + t.Parallel() + c := newHandleCache() + done := make(chan struct{}) + // One writer, multiple readers — prove the RWMutex usage. 
+ go func() { + for i := 0; i < 1000; i++ { + c.set("vm-1", model.VMHandles{PID: i}) + } + close(done) + }() + for i := 0; i < 1000; i++ { + _, _ = c.get("vm-1") + } + <-done + c.clear("vm-1") + if _, ok := c.get("vm-1"); ok { + t.Fatal("cache entry still present after clear") + } +} + +// TestRediscoverHandlesLoadsScratchWhenProcessDead proves the stale- +// cleanup path: the firecracker process is gone, but the scratch +// file tells us which kernel resources the previous daemon still +// owes us a teardown on. +func TestRediscoverHandlesLoadsScratchWhenProcessDead(t *testing.T) { + t.Parallel() + + vmDir := t.TempDir() + apiSock := filepath.Join(t.TempDir(), "fc.sock") + stale := model.VMHandles{ + PID: 999999, + BaseLoop: "/dev/loop99", + COWLoop: "/dev/loop100", + DMName: "fc-rootfs-gone", + DMDev: "/dev/mapper/fc-rootfs-gone", + } + if err := writeHandlesFile(vmDir, stale); err != nil { + t.Fatalf("writeHandlesFile: %v", err) + } + + // A scripted runner that reports "no such process" when reconcile + // probes via pgrep. + runner := &scriptedRunner{ + t: t, + steps: []runnerStep{ + {call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, err: &exitErr{code: 1}}, + }, + } + d := &Daemon{runner: runner} + vm := testVM("gone", "image-gone", "172.16.0.250") + vm.Runtime.APISockPath = apiSock + vm.Runtime.VMDir = vmDir + + got, alive, err := d.rediscoverHandles(context.Background(), vm) + if err != nil { + t.Fatalf("rediscoverHandles: %v", err) + } + if alive { + t.Fatal("alive = true, want false (process dead)") + } + // Even when dead, the scratch handles must be returned so + // cleanupRuntime can tear DM + loops + tap down. + if got.DMName != stale.DMName || got.BaseLoop != stale.BaseLoop || got.COWLoop != stale.COWLoop { + t.Fatalf("stale handles lost: got %+v, want fields from %+v", got, stale) + } + runner.assertExhausted() +} + +// TestRediscoverHandlesPrefersLivePIDOverScratch: scratch file has an +// old PID, but pgrep finds the actual current PID via the api sock. +func TestRediscoverHandlesPrefersLivePIDOverScratch(t *testing.T) { + t.Parallel() + + vmDir := t.TempDir() + apiSock := filepath.Join(t.TempDir(), "fc.sock") + if err := writeHandlesFile(vmDir, model.VMHandles{PID: 111, DMName: "dm-x"}); err != nil { + t.Fatalf("writeHandlesFile: %v", err) + } + + runner := &scriptedRunner{ + t: t, + steps: []runnerStep{ + {call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, out: []byte("222\n")}, + }, + } + d := &Daemon{runner: runner} + vm := testVM("moved", "image-moved", "172.16.0.251") + vm.Runtime.APISockPath = apiSock + vm.Runtime.VMDir = vmDir + + got, alive, err := d.rediscoverHandles(context.Background(), vm) + if err != nil { + t.Fatalf("rediscoverHandles: %v", err) + } + if !alive { + t.Fatal("alive = false, want true (pgrep found a PID)") + } + if got.PID != 222 { + t.Fatalf("PID = %d, want 222 (from pgrep, not scratch)", got.PID) + } + if got.DMName != "dm-x" { + t.Fatalf("scratch fields dropped: %+v", got) + } + runner.assertExhausted() +} + +// TestClearVMHandlesRemovesScratchFile proves the cleanup contract. 
+func TestClearVMHandlesRemovesScratchFile(t *testing.T) { + t.Parallel() + vmDir := t.TempDir() + if err := writeHandlesFile(vmDir, model.VMHandles{PID: 42}); err != nil { + t.Fatalf("writeHandlesFile: %v", err) + } + + d := &Daemon{} + vm := testVM("sweep", "image-sweep", "172.16.0.252") + vm.Runtime.VMDir = vmDir + d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: 42}) + d.clearVMHandles(vm) + + if _, err := os.Stat(handlesFilePath(vmDir)); !os.IsNotExist(err) { + t.Fatalf("scratch file still present: %v", err) + } + if h, ok := d.handles.get(vm.ID); ok && !h.IsZero() { + t.Fatalf("cache entry survives clear: %+v", h) + } +} + +// exitErr is a minimal stand-in for an exec-style non-zero exit. +// Used by scripted runners to simulate "pgrep found nothing". +type exitErr struct{ code int } + +func (e *exitErr) Error() string { return "exit status " + strings.Repeat("1", 1) } diff --git a/internal/daemon/vm_lifecycle.go b/internal/daemon/vm_lifecycle.go index 17713da..ed1750e 100644 --- a/internal/daemon/vm_lifecycle.go +++ b/internal/daemon/vm_lifecycle.go @@ -22,7 +22,7 @@ func (d *Daemon) StartVM(ctx context.Context, idOrName string) (model.VMRecord, if err != nil { return model.VMRecord{}, err } - if vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if d.vmAlive(vm) { if d.logger != nil { d.logger.Info("vm already running", vmLogAttrs(vm)...) } @@ -54,7 +54,7 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod if err := d.cleanupRuntime(ctx, vm, true); err != nil { return model.VMRecord{}, err } - clearRuntimeHandles(&vm) + d.clearVMHandles(vm) op.stage("bridge") if err := d.ensureBridge(ctx); err != nil { return model.VMRecord{}, err @@ -92,14 +92,23 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod op.stage("dm_snapshot", "dm_name", dmName) vmCreateStage(ctx, "prepare_rootfs", "creating root filesystem snapshot") - handles, err := d.createDMSnapshot(ctx, image.RootfsPath, vm.Runtime.SystemOverlay, dmName) + snapHandles, err := d.createDMSnapshot(ctx, image.RootfsPath, vm.Runtime.SystemOverlay, dmName) if err != nil { return model.VMRecord{}, err } - vm.Runtime.BaseLoop = handles.BaseLoop - vm.Runtime.COWLoop = handles.COWLoop - vm.Runtime.DMName = handles.DMName - vm.Runtime.DMDev = handles.DMDev + // Live handles are threaded through this function as a local and + // pushed to the cache via setVMHandles once we have every piece. + // The cache update must happen BEFORE any step that reads handles + // back (e.g. cleanupRuntime via cleanupOnErr) — otherwise loops + // and DM would leak on an early failure. 
+ live := model.VMHandles{ + BaseLoop: snapHandles.BaseLoop, + COWLoop: snapHandles.COWLoop, + DMName: snapHandles.DMName, + DMDev: snapHandles.DMDev, + } + d.setVMHandles(vm, live) + vm.Runtime.APISockPath = apiSock vm.Runtime.State = model.VMStateRunning vm.State = model.VMStateRunning @@ -113,7 +122,7 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod if cleanupErr := d.cleanupRuntime(context.Background(), vm, true); cleanupErr != nil { err = errors.Join(err, cleanupErr) } - clearRuntimeHandles(&vm) + d.clearVMHandles(vm) _ = d.store.UpsertVM(context.Background(), vm) return model.VMRecord{}, err } @@ -133,7 +142,8 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod if err != nil { return cleanupOnErr(err) } - vm.Runtime.TapDevice = tap + live.TapDevice = tap + d.setVMHandles(vm, live) op.stage("metrics_file", "metrics_path", vm.Runtime.MetricsPath) if err := os.WriteFile(vm.Runtime.MetricsPath, nil, 0o644); err != nil { return cleanupOnErr(err) @@ -170,7 +180,7 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod KernelArgs: kernelArgs, Drives: []firecracker.DriveConfig{{ ID: "rootfs", - Path: vm.Runtime.DMDev, + Path: live.DMDev, ReadOnly: false, IsRoot: true, }}, @@ -190,11 +200,13 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod // Use a fresh context: the request ctx may already be cancelled (client // disconnect), but we still need the PID so cleanupRuntime can kill the // Firecracker process that was spawned before the failure. - vm.Runtime.PID = d.resolveFirecrackerPID(context.Background(), machine, apiSock) + live.PID = d.resolveFirecrackerPID(context.Background(), machine, apiSock) + d.setVMHandles(vm, live) return cleanupOnErr(err) } - vm.Runtime.PID = d.resolveFirecrackerPID(context.Background(), machine, apiSock) - op.debugStage("firecracker_started", "pid", vm.Runtime.PID) + live.PID = d.resolveFirecrackerPID(context.Background(), machine, apiSock) + d.setVMHandles(vm, live) + op.debugStage("firecracker_started", "pid", live.PID) op.stage("socket_access", "api_socket", apiSock) if err := d.ensureSocketAccess(ctx, apiSock, "firecracker api socket"); err != nil { return cleanupOnErr(err) @@ -237,29 +249,30 @@ func (d *Daemon) stopVMLocked(ctx context.Context, current model.VMRecord) (vm m } op.done(vmLogAttrs(vm)...) 
}() - if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if !d.vmAlive(vm) { op.stage("cleanup_stale_runtime") if err := d.cleanupRuntime(ctx, vm, true); err != nil { return model.VMRecord{}, err } vm.State = model.VMStateStopped vm.Runtime.State = model.VMStateStopped - clearRuntimeHandles(&vm) + d.clearVMHandles(vm) if err := d.store.UpsertVM(ctx, vm); err != nil { return model.VMRecord{}, err } return vm, nil } + pid := d.vmHandles(vm.ID).PID op.stage("graceful_shutdown") if err := d.sendCtrlAltDel(ctx, vm); err != nil { return model.VMRecord{}, err } - op.stage("wait_for_exit", "pid", vm.Runtime.PID) - if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil { + op.stage("wait_for_exit", "pid", pid) + if err := d.waitForExit(ctx, pid, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil { if !errors.Is(err, errWaitForExitTimeout) { return model.VMRecord{}, err } - op.stage("graceful_shutdown_timeout", "pid", vm.Runtime.PID) + op.stage("graceful_shutdown_timeout", "pid", pid) } op.stage("cleanup_runtime") if err := d.cleanupRuntime(ctx, vm, true); err != nil { @@ -267,7 +280,7 @@ func (d *Daemon) stopVMLocked(ctx context.Context, current model.VMRecord) (vm m } vm.State = model.VMStateStopped vm.Runtime.State = model.VMStateStopped - clearRuntimeHandles(&vm) + d.clearVMHandles(vm) system.TouchNow(&vm) if err := d.store.UpsertVM(ctx, vm); err != nil { return model.VMRecord{}, err @@ -291,14 +304,14 @@ func (d *Daemon) killVMLocked(ctx context.Context, current model.VMRecord, signa } op.done(vmLogAttrs(vm)...) }() - if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if !d.vmAlive(vm) { op.stage("cleanup_stale_runtime") if err := d.cleanupRuntime(ctx, vm, true); err != nil { return model.VMRecord{}, err } vm.State = model.VMStateStopped vm.Runtime.State = model.VMStateStopped - clearRuntimeHandles(&vm) + d.clearVMHandles(vm) if err := d.store.UpsertVM(ctx, vm); err != nil { return model.VMRecord{}, err } @@ -309,16 +322,17 @@ func (d *Daemon) killVMLocked(ctx context.Context, current model.VMRecord, signa if signal == "" { signal = "TERM" } - op.stage("send_signal", "pid", vm.Runtime.PID, "signal", signal) - if _, err := d.runner.RunSudo(ctx, "kill", "-"+signal, strconv.Itoa(vm.Runtime.PID)); err != nil { + pid := d.vmHandles(vm.ID).PID + op.stage("send_signal", "pid", pid, "signal", signal) + if _, err := d.runner.RunSudo(ctx, "kill", "-"+signal, strconv.Itoa(pid)); err != nil { return model.VMRecord{}, err } - op.stage("wait_for_exit", "pid", vm.Runtime.PID) - if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil { + op.stage("wait_for_exit", "pid", pid) + if err := d.waitForExit(ctx, pid, vm.Runtime.APISockPath, 30*time.Second); err != nil { if !errors.Is(err, errWaitForExitTimeout) { return model.VMRecord{}, err } - op.stage("signal_timeout", "pid", vm.Runtime.PID, "signal", signal) + op.stage("signal_timeout", "pid", pid, "signal", signal) } op.stage("cleanup_runtime") if err := d.cleanupRuntime(ctx, vm, true); err != nil { @@ -326,7 +340,7 @@ func (d *Daemon) killVMLocked(ctx context.Context, current model.VMRecord, signa } vm.State = model.VMStateStopped vm.Runtime.State = model.VMStateStopped - clearRuntimeHandles(&vm) + d.clearVMHandles(vm) system.TouchNow(&vm) if err := d.store.UpsertVM(ctx, vm); err != nil { return model.VMRecord{}, err @@ -378,9 +392,10 @@ func (d *Daemon) 
deleteVMLocked(ctx context.Context, current model.VMRecord) (vm } op.done(vmLogAttrs(vm)...) }() - if vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { - op.stage("kill_running_vm", "pid", vm.Runtime.PID) - _ = d.killVMProcess(ctx, vm.Runtime.PID) + if d.vmAlive(vm) { + pid := d.vmHandles(vm.ID).PID + op.stage("kill_running_vm", "pid", pid) + _ = d.killVMProcess(ctx, pid) } op.stage("cleanup_runtime") if err := d.cleanupRuntime(ctx, vm, false); err != nil { diff --git a/internal/daemon/vm_set.go b/internal/daemon/vm_set.go index 5ffae29..977991b 100644 --- a/internal/daemon/vm_set.go +++ b/internal/daemon/vm_set.go @@ -25,7 +25,7 @@ func (d *Daemon) setVMLocked(ctx context.Context, current model.VMRecord, params } op.done(vmLogAttrs(vm)...) }() - running := vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) + running := d.vmAlive(vm) if params.VCPUCount != nil { if err := validateOptionalPositiveSetting("vcpu", params.VCPUCount); err != nil { return model.VMRecord{}, err diff --git a/internal/daemon/vm_stats.go b/internal/daemon/vm_stats.go index 9d49043..d917150 100644 --- a/internal/daemon/vm_stats.go +++ b/internal/daemon/vm_stats.go @@ -25,7 +25,7 @@ func (d *Daemon) GetVMStats(ctx context.Context, idOrName string) (model.VMRecor func (d *Daemon) HealthVM(ctx context.Context, idOrName string) (result api.VMHealthResult, err error) { _, err = d.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) { result.Name = vm.Name - if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if !d.vmAlive(vm) { result.Healthy = false return vm, nil } @@ -77,7 +77,7 @@ func (d *Daemon) pollStats(ctx context.Context) error { } for _, vm := range vms { if err := d.withVMLockByIDErr(ctx, vm.ID, func(vm model.VMRecord) error { - if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if !d.vmAlive(vm) { return nil } stats, err := d.collectStats(ctx, vm) @@ -116,7 +116,7 @@ func (d *Daemon) stopStaleVMs(ctx context.Context) (err error) { now := model.Now() for _, vm := range vms { if err := d.withVMLockByIDErr(ctx, vm.ID, func(vm model.VMRecord) error { - if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if !d.vmAlive(vm) { return nil } if now.Sub(vm.LastTouchedAt) < d.config.AutoStopStaleAfter { @@ -124,11 +124,11 @@ func (d *Daemon) stopStaleVMs(ctx context.Context) (err error) { } op.stage("stopping_vm", vmLogAttrs(vm)...) 
_ = d.sendCtrlAltDel(ctx, vm) - _ = d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 10*time.Second) + _ = d.waitForExit(ctx, d.vmHandles(vm.ID).PID, vm.Runtime.APISockPath, 10*time.Second) _ = d.cleanupRuntime(ctx, vm, true) vm.State = model.VMStateStopped vm.Runtime.State = model.VMStateStopped - clearRuntimeHandles(&vm) + d.clearVMHandles(vm) vm.UpdatedAt = model.Now() return d.store.UpsertVM(ctx, vm) }); err != nil { @@ -145,9 +145,8 @@ func (d *Daemon) collectStats(ctx context.Context, vm model.VMRecord) (model.VMS WorkDiskBytes: system.AllocatedBytes(vm.Runtime.WorkDiskPath), MetricsRaw: system.ParseMetricsFile(vm.Runtime.MetricsPath), } - if vm.Runtime.PID > 0 && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { - ps, err := system.ReadProcessStats(ctx, vm.Runtime.PID) - if err == nil { + if d.vmAlive(vm) { + if ps, err := system.ReadProcessStats(ctx, d.vmHandles(vm.ID).PID); err == nil { stats.CPUPercent = ps.CPUPercent stats.RSSBytes = ps.RSSBytes stats.VSZBytes = ps.VSZBytes diff --git a/internal/daemon/vm_test.go b/internal/daemon/vm_test.go index 26dbf98..c6ae796 100644 --- a/internal/daemon/vm_test.go +++ b/internal/daemon/vm_test.go @@ -112,21 +112,36 @@ func TestReconcileStopsStaleRunningVMAndClearsRuntimeHandles(t *testing.T) { if err := os.WriteFile(apiSock, []byte{}, 0o644); err != nil { t.Fatalf("WriteFile(api sock): %v", err) } + vmDir := t.TempDir() vm := testVM("stale", "image-stale", "172.16.0.9") vm.State = model.VMStateRunning vm.Runtime.State = model.VMStateRunning - vm.Runtime.PID = 999999 vm.Runtime.APISockPath = apiSock - vm.Runtime.DMName = "fc-rootfs-stale" - vm.Runtime.DMDev = "/dev/mapper/fc-rootfs-stale" - vm.Runtime.COWLoop = "/dev/loop11" - vm.Runtime.BaseLoop = "/dev/loop10" + vm.Runtime.VMDir = vmDir vm.Runtime.DNSName = "" upsertDaemonVM(t, ctx, db, vm) + // Simulate the prior daemon crashing while this VM was running: + // the handles.json scratch file survives and names a stale PID + + // DM snapshot. Reconcile should discover the PID is gone, tear + // the kernel state down via the runner, and clear the scratch. + stale := model.VMHandles{ + PID: 999999, + BaseLoop: "/dev/loop10", + COWLoop: "/dev/loop11", + DMName: "fc-rootfs-stale", + DMDev: "/dev/mapper/fc-rootfs-stale", + } + if err := writeHandlesFile(vmDir, stale); err != nil { + t.Fatalf("writeHandlesFile: %v", err) + } + runner := &scriptedRunner{ t: t, steps: []runnerStep{ + // First pgrep: rediscoverHandles tries to verify the PID. + {call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, err: errors.New("exit status 1")}, + // Second pgrep: cleanupRuntime asks again before killing. {call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, err: errors.New("exit status 1")}, sudoStep("", nil, "dmsetup", "remove", "fc-rootfs-stale"), sudoStep("", nil, "losetup", "-d", "/dev/loop11"), @@ -147,8 +162,13 @@ func TestReconcileStopsStaleRunningVMAndClearsRuntimeHandles(t *testing.T) { if got.State != model.VMStateStopped || got.Runtime.State != model.VMStateStopped { t.Fatalf("vm state after reconcile = %s/%s, want stopped", got.State, got.Runtime.State) } - if got.Runtime.PID != 0 || got.Runtime.APISockPath != "" || got.Runtime.DMName != "" || got.Runtime.COWLoop != "" || got.Runtime.BaseLoop != "" { - t.Fatalf("runtime handles not cleared after reconcile: %+v", got.Runtime) + // The scratch file must be gone — stopped VMs don't carry handles. 
+ if _, err := os.Stat(handlesFilePath(vmDir)); !os.IsNotExist(err) { + t.Fatalf("handles.json still present after reconcile: %v", err) + } + // And the in-memory cache must be empty. + if h, ok := d.handles.get(vm.ID); ok && !h.IsZero() { + t.Fatalf("handle cache not cleared after reconcile: %+v", h) } } @@ -168,13 +188,11 @@ func TestRebuildDNSIncludesOnlyLiveRunningVMs(t *testing.T) { live := testVM("live", "image-live", "172.16.0.21") live.State = model.VMStateRunning live.Runtime.State = model.VMStateRunning - live.Runtime.PID = liveCmd.Process.Pid live.Runtime.APISockPath = liveSock stale := testVM("stale", "image-stale", "172.16.0.22") stale.State = model.VMStateRunning stale.Runtime.State = model.VMStateRunning - stale.Runtime.PID = 999999 stale.Runtime.APISockPath = filepath.Join(t.TempDir(), "stale.sock") stopped := testVM("stopped", "image-stopped", "172.16.0.23") @@ -195,6 +213,11 @@ func TestRebuildDNSIncludesOnlyLiveRunningVMs(t *testing.T) { }) d := &Daemon{store: db, vmDNS: server} + // rebuildDNS reads the alive check from the handle cache. Seed + // the live VM with its real PID; leave the stale entry with a PID + // that definitely isn't running (999999 ≫ max PID on most hosts). + d.setVMHandlesInMemory(live.ID, model.VMHandles{PID: liveCmd.Process.Pid}) + d.setVMHandlesInMemory(stale.ID, model.VMHandles{PID: 999999}) if err := d.rebuildDNS(ctx); err != nil { t.Fatalf("rebuildDNS: %v", err) } @@ -225,11 +248,11 @@ func TestSetVMRejectsStoppedOnlyChangesForRunningVM(t *testing.T) { vm := testVM("running", "image-run", "172.16.0.10") vm.State = model.VMStateRunning vm.Runtime.State = model.VMStateRunning - vm.Runtime.PID = cmd.Process.Pid vm.Runtime.APISockPath = apiSock upsertDaemonVM(t, ctx, db, vm) d := &Daemon{store: db} + d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: cmd.Process.Pid}) tests := []struct { name string params api.VMSetParams @@ -330,12 +353,12 @@ func TestHealthVMReturnsHealthyForRunningGuest(t *testing.T) { vm := testVM("alive", "image-alive", "172.16.0.41") vm.State = model.VMStateRunning vm.Runtime.State = model.VMStateRunning - vm.Runtime.PID = fake.Process.Pid vm.Runtime.APISockPath = apiSock vm.Runtime.VSockPath = vsockSock vm.Runtime.VSockCID = 10041 upsertDaemonVM(t, ctx, db, vm) + handlePID := fake.Process.Pid runner := &scriptedRunner{ t: t, steps: []runnerStep{ @@ -344,6 +367,7 @@ func TestHealthVMReturnsHealthyForRunningGuest(t *testing.T) { }, } d := &Daemon{store: db, runner: runner} + d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: handlePID}) result, err := d.HealthVM(ctx, vm.Name) if err != nil { t.Fatalf("HealthVM: %v", err) @@ -393,7 +417,6 @@ func TestPingVMAliasReturnsAliveForHealthyVM(t *testing.T) { vm := testVM("healthy-ping", "image-healthy", "172.16.0.42") vm.State = model.VMStateRunning vm.Runtime.State = model.VMStateRunning - vm.Runtime.PID = fake.Process.Pid vm.Runtime.APISockPath = apiSock vm.Runtime.VSockPath = vsockSock vm.Runtime.VSockCID = 10042 @@ -407,6 +430,7 @@ func TestPingVMAliasReturnsAliveForHealthyVM(t *testing.T) { }, } d := &Daemon{store: db, runner: runner} + d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: fake.Process.Pid}) result, err := d.PingVM(ctx, vm.Name) if err != nil { t.Fatalf("PingVM: %v", err) @@ -590,7 +614,6 @@ func TestPortsVMReturnsEnrichedPortsAndWebSchemes(t *testing.T) { vm := testVM("ports", "image-ports", "127.0.0.1") vm.State = model.VMStateRunning vm.Runtime.State = model.VMStateRunning - vm.Runtime.PID = fake.Process.Pid vm.Runtime.APISockPath = apiSock vm.Runtime.VSockPath 
= vsockSock vm.Runtime.VSockCID = 10043 @@ -604,6 +627,7 @@ func TestPortsVMReturnsEnrichedPortsAndWebSchemes(t *testing.T) { }, } d := &Daemon{store: db, runner: runner} + d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: fake.Process.Pid}) result, err := d.PortsVM(ctx, vm.Name) if err != nil { @@ -1341,8 +1365,10 @@ func TestCleanupRuntimeRediscoversLiveFirecrackerPID(t *testing.T) { } d := &Daemon{runner: runner} vm := testVM("cleanup", "image-cleanup", "172.16.0.22") - vm.Runtime.PID = fake.Process.Pid + 999 vm.Runtime.APISockPath = apiSock + // Seed a stale PID so cleanupRuntime's findFirecrackerPID pgrep + // fallback wins — it rediscovers fake.Process.Pid from apiSock. + d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: fake.Process.Pid + 999}) if err := d.cleanupRuntime(context.Background(), vm, true); err != nil { t.Fatalf("cleanupRuntime returned error: %v", err) @@ -1366,7 +1392,6 @@ func TestDeleteStoppedNATVMDoesNotFailWithoutTapDevice(t *testing.T) { vm := testVM("stopped-nat", "image-stopped-nat", "172.16.0.24") vm.Spec.NATEnabled = true vm.Runtime.VMDir = vmDir - vm.Runtime.TapDevice = "" vm.State = model.VMStateStopped vm.Runtime.State = model.VMStateStopped upsertDaemonVM(t, ctx, db, vm) @@ -1410,7 +1435,6 @@ func TestStopVMFallsBackToForcedCleanupAfterGracefulTimeout(t *testing.T) { vm := testVM("stubborn", "image-stubborn", "172.16.0.23") vm.State = model.VMStateRunning vm.Runtime.State = model.VMStateRunning - vm.Runtime.PID = fake.Process.Pid vm.Runtime.APISockPath = apiSock upsertDaemonVM(t, ctx, db, vm) @@ -1427,6 +1451,7 @@ func TestStopVMFallsBackToForcedCleanupAfterGracefulTimeout(t *testing.T) { proc: fake, } d := &Daemon{store: db, runner: runner} + d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: fake.Process.Pid}) got, err := d.StopVM(ctx, vm.ID) if err != nil { @@ -1436,8 +1461,11 @@ func TestStopVMFallsBackToForcedCleanupAfterGracefulTimeout(t *testing.T) { if got.State != model.VMStateStopped || got.Runtime.State != model.VMStateStopped { t.Fatalf("StopVM state = %s/%s, want stopped", got.State, got.Runtime.State) } - if got.Runtime.PID != 0 || got.Runtime.APISockPath != "" { - t.Fatalf("runtime handles not cleared: %+v", got.Runtime) + // APISockPath + VSock paths are deterministic — they stay on the + // record for debugging and next-start reuse even after stop. The + // post-stop invariant is that the in-memory cache is empty. + if h, ok := d.handles.get(vm.ID); ok && !h.IsZero() { + t.Fatalf("handle cache not cleared: %+v", h) } } diff --git a/internal/daemon/workspace.go b/internal/daemon/workspace.go index f94085b..531e98c 100644 --- a/internal/daemon/workspace.go +++ b/internal/daemon/workspace.go @@ -13,7 +13,6 @@ import ( sess "banger/internal/daemon/session" ws "banger/internal/daemon/workspace" "banger/internal/model" - "banger/internal/system" ) // Test seams. 
Tests swap these to observe or stall the guest-I/O @@ -33,7 +32,7 @@ func (d *Daemon) ExportVMWorkspace(ctx context.Context, params api.WorkspaceExpo if err != nil { return api.WorkspaceExportResult{}, err } - if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if !d.vmAlive(vm) { return api.WorkspaceExportResult{}, fmt.Errorf("vm %q is not running", vm.Name) } // Serialise with any in-flight workspace.prepare on the same VM so @@ -133,7 +132,7 @@ func (d *Daemon) PrepareVMWorkspace(ctx context.Context, params api.VMWorkspaceP // before any SSH or tar I/O so this slow operation cannot block // vm stop / vm delete / vm restart on the same VM. vm, err := d.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) { - if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { + if !d.vmAlive(vm) { return model.VMRecord{}, fmt.Errorf("vm %q is not running", vm.Name) } return vm, nil diff --git a/internal/daemon/workspace_test.go b/internal/daemon/workspace_test.go index 49f7ff4..2194dce 100644 --- a/internal/daemon/workspace_test.go +++ b/internal/daemon/workspace_test.go @@ -81,7 +81,6 @@ func TestExportVMWorkspace_HappyPath(t *testing.T) { vm := testVM("exportbox", "image-export", "172.16.0.100") vm.State = model.VMStateRunning vm.Runtime.State = model.VMStateRunning - vm.Runtime.PID = firecracker.Process.Pid vm.Runtime.APISockPath = apiSock patch := []byte("diff --git a/file.go b/file.go\nindex 0000000..1111111 100644\n") @@ -95,6 +94,7 @@ func TestExportVMWorkspace_HappyPath(t *testing.T) { } d := newExportTestDaemonStore(t, fake) upsertDaemonVM(t, ctx, d.store, vm) + d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid}) result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{ IDOrName: vm.Name, @@ -139,7 +139,6 @@ func TestExportVMWorkspace_WithBaseCommit(t *testing.T) { vm := testVM("exportbox-base", "image-export", "172.16.0.105") vm.State = model.VMStateRunning vm.Runtime.State = model.VMStateRunning - vm.Runtime.PID = firecracker.Process.Pid vm.Runtime.APISockPath = apiSock // Simulate: worker committed inside the VM. 
diff --git a/internal/daemon/workspace_test.go b/internal/daemon/workspace_test.go
index 49f7ff4..2194dce 100644
--- a/internal/daemon/workspace_test.go
+++ b/internal/daemon/workspace_test.go
@@ -81,7 +81,6 @@ func TestExportVMWorkspace_HappyPath(t *testing.T) {
 	vm := testVM("exportbox", "image-export", "172.16.0.100")
 	vm.State = model.VMStateRunning
 	vm.Runtime.State = model.VMStateRunning
-	vm.Runtime.PID = firecracker.Process.Pid
 	vm.Runtime.APISockPath = apiSock
 
 	patch := []byte("diff --git a/file.go b/file.go\nindex 0000000..1111111 100644\n")
@@ -95,6 +94,7 @@ func TestExportVMWorkspace_HappyPath(t *testing.T) {
 	}
 	d := newExportTestDaemonStore(t, fake)
 	upsertDaemonVM(t, ctx, d.store, vm)
+	d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
 
 	result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
 		IDOrName: vm.Name,
@@ -139,7 +139,6 @@ func TestExportVMWorkspace_WithBaseCommit(t *testing.T) {
 	vm := testVM("exportbox-base", "image-export", "172.16.0.105")
 	vm.State = model.VMStateRunning
 	vm.Runtime.State = model.VMStateRunning
-	vm.Runtime.PID = firecracker.Process.Pid
 	vm.Runtime.APISockPath = apiSock
 
 	// Simulate: worker committed inside the VM. Without base_commit the diff
@@ -156,6 +155,7 @@ func TestExportVMWorkspace_WithBaseCommit(t *testing.T) {
 	}
 	d := newExportTestDaemonStore(t, fake)
 	upsertDaemonVM(t, ctx, d.store, vm)
+	d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
 
 	const prepareCommit = "abc1234deadbeef"
 	result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
@@ -192,7 +192,6 @@ func TestExportVMWorkspace_BaseCommitFallsBackToHEAD(t *testing.T) {
 	vm := testVM("exportbox-nobase", "image-export", "172.16.0.106")
 	vm.State = model.VMStateRunning
 	vm.Runtime.State = model.VMStateRunning
-	vm.Runtime.PID = firecracker.Process.Pid
 	vm.Runtime.APISockPath = apiSock
 
 	fake := &exportGuestClient{
@@ -203,6 +202,7 @@
 	}
 	d := newExportTestDaemonStore(t, fake)
 	upsertDaemonVM(t, ctx, d.store, vm)
+	d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
 
 	result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
 		IDOrName: vm.Name,
@@ -231,7 +231,6 @@ func TestExportVMWorkspace_NoChanges(t *testing.T) {
 	vm := testVM("exportbox-empty", "image-export", "172.16.0.101")
 	vm.State = model.VMStateRunning
 	vm.Runtime.State = model.VMStateRunning
-	vm.Runtime.PID = firecracker.Process.Pid
 	vm.Runtime.APISockPath = apiSock
 
 	// Both scripts return empty output (no changes).
@@ -243,6 +242,7 @@
 	}
 	d := newExportTestDaemonStore(t, fake)
 	upsertDaemonVM(t, ctx, d.store, vm)
+	d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
 
 	result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
 		IDOrName: vm.Name,
@@ -271,7 +271,6 @@ func TestExportVMWorkspace_DefaultGuestPath(t *testing.T) {
 	vm := testVM("exportbox-default", "image-export", "172.16.0.102")
 	vm.State = model.VMStateRunning
 	vm.Runtime.State = model.VMStateRunning
-	vm.Runtime.PID = firecracker.Process.Pid
 	vm.Runtime.APISockPath = apiSock
 
 	fake := &exportGuestClient{
@@ -282,6 +281,7 @@
 	}
 	d := newExportTestDaemonStore(t, fake)
 	upsertDaemonVM(t, ctx, d.store, vm)
+	d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
 
 	// GuestPath omitted — should default to /root/repo.
 	result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
@@ -305,6 +305,7 @@ func TestExportVMWorkspace_VMNotRunning(t *testing.T) {
 	fake := &exportGuestClient{}
 	d := newExportTestDaemonStore(t, fake)
 	upsertDaemonVM(t, ctx, d.store, vm)
+	// VM is stopped — no handle seed; vmAlive must return false.
 
 	_, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
 		IDOrName: vm.Name,
@@ -327,7 +328,6 @@ func TestExportVMWorkspace_MultipleChangedFiles(t *testing.T) {
 	vm := testVM("exportbox-multi", "image-export", "172.16.0.104")
 	vm.State = model.VMStateRunning
 	vm.Runtime.State = model.VMStateRunning
-	vm.Runtime.PID = firecracker.Process.Pid
 	vm.Runtime.APISockPath = apiSock
 
 	patch := []byte("diff --git a/a.go b/a.go\n--- a/a.go\n+++ b/a.go\n")
@@ -341,6 +341,7 @@
 	}
 	d := newExportTestDaemonStore(t, fake)
 	upsertDaemonVM(t, ctx, d.store, vm)
+	d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
 
 	result, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{
 		IDOrName: vm.Name,
@@ -380,7 +381,6 @@ func TestPrepareVMWorkspace_ReleasesVMLockDuringGuestIO(t *testing.T) {
 	vm := testVM("lockbox", "image-x", "172.16.0.210")
 	vm.State = model.VMStateRunning
 	vm.Runtime.State = model.VMStateRunning
-	vm.Runtime.PID = firecracker.Process.Pid
 	vm.Runtime.APISockPath = apiSock
 
 	d := &Daemon{
@@ -393,6 +393,7 @@
 		return &exportGuestClient{}, nil
 	}
 	upsertDaemonVM(t, ctx, d.store, vm)
+	d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
 
 	// Replace the seams. InspectRepo returns a trivial spec so the
 	// real filesystem isn't touched; Import blocks until we say go.
@@ -473,7 +474,6 @@ func TestPrepareVMWorkspace_SerialisesConcurrentPreparesOnSameVM(t *testing.T) {
 	vm := testVM("serialbox", "image-x", "172.16.0.211")
 	vm.State = model.VMStateRunning
 	vm.Runtime.State = model.VMStateRunning
-	vm.Runtime.PID = firecracker.Process.Pid
 	vm.Runtime.APISockPath = apiSock
 
 	d := &Daemon{
@@ -486,6 +486,7 @@
 		return &exportGuestClient{}, nil
 	}
 	upsertDaemonVM(t, ctx, d.store, vm)
+	d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
 
 	origInspect := workspaceInspectRepoFunc
 	origImport := workspaceImportFunc
@@ -569,7 +570,6 @@ func TestExportVMWorkspace_DoesNotMutateRealIndex(t *testing.T) {
 	vm := testVM("exportbox-readonly", "image-export", "172.16.0.107")
 	vm.State = model.VMStateRunning
 	vm.Runtime.State = model.VMStateRunning
-	vm.Runtime.PID = firecracker.Process.Pid
 	vm.Runtime.APISockPath = apiSock
 
 	fake := &exportGuestClient{
@@ -580,6 +580,7 @@
 	}
 	d := newExportTestDaemonStore(t, fake)
 	upsertDaemonVM(t, ctx, d.store, vm)
+	d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: firecracker.Process.Pid})
 
 	if _, err := d.ExportVMWorkspace(ctx, api.WorkspaceExportParams{IDOrName: vm.Name}); err != nil {
 		t.Fatalf("ExportVMWorkspace: %v", err)
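
Note (illustrative, not part of the patch): every running-VM test in workspace_test.go now repeats the same three-step setup, mark the record running, upsert it, then seed the handle cache with the fake firecracker PID. A helper like the hypothetical seedRunningVM below would fold that pattern; it is shown only to make the repetition explicit, not as a change this patch makes.

// Hypothetical test helper, not part of the patch.
func seedRunningVM(t *testing.T, ctx context.Context, d *Daemon, vm model.VMRecord, pid int) {
	t.Helper()
	vm.State = model.VMStateRunning
	vm.Runtime.State = model.VMStateRunning
	upsertDaemonVM(t, ctx, d.store, vm)
	d.setVMHandlesInMemory(vm.ID, model.VMHandles{PID: pid})
}
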
diff --git a/internal/model/types.go b/internal/model/types.go
index 673ac3d..5d8cd0a 100644
--- a/internal/model/types.go
+++ b/internal/model/types.go
@@ -107,11 +107,22 @@ type VMSpec struct {
 	NATEnabled bool `json:"nat_enabled"`
 }
 
+// VMRuntime holds the durable runtime state that the daemon needs
+// to reach a VM: identity, declared state, and deterministic derived
+// paths. Transient kernel/process handles (PID, tap, loop devices,
+// dm-snapshot names) live on VMHandles, NOT here — the daemon keeps
+// them in an in-memory cache backed by a per-VM handles.json scratch
+// file, so a daemon restart rebuilds them from OS state rather than
+// trusting whatever was last written into a SQLite column.
+//
+// Everything in VMRuntime is safe to persist: the paths are
+// deterministic from (VM ID, layout) and survive restart unchanged;
+// GuestIP and DNSName are assigned at create time and never move;
+// LastError carries the last failure message for debugging. State
+// mirrors VMRecord.State.
 type VMRuntime struct {
 	State       VMState `json:"state"`
-	PID         int     `json:"pid,omitempty"`
 	GuestIP     string  `json:"guest_ip"`
-	TapDevice   string  `json:"tap_device,omitempty"`
 	APISockPath string  `json:"api_sock_path,omitempty"`
 	VSockPath   string  `json:"vsock_path,omitempty"`
 	VSockCID    uint32  `json:"vsock_cid,omitempty"`
@@ -121,10 +132,6 @@ type VMRuntime struct {
 	VMDir         string `json:"vm_dir"`
 	SystemOverlay string `json:"system_overlay_path"`
 	WorkDiskPath  string `json:"work_disk_path"`
-	BaseLoop      string `json:"base_loop,omitempty"`
-	COWLoop       string `json:"cow_loop,omitempty"`
-	DMName        string `json:"dm_name,omitempty"`
-	DMDev         string `json:"dm_dev,omitempty"`
 	LastError     string `json:"last_error,omitempty"`
 }
 
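
Note (illustrative, not part of the patch): the new VMRuntime comment leans on the per-VM handles.json scratch file as what makes the in-memory cache rebuildable, but the reader/writer is not part of this diff. Assuming the file is plain JSON of model.VMHandles inside the VM directory (VMRuntime.VMDir), the startup-side read could look roughly like this; the name loadScratchHandles is an assumption.

// Sketch only; requires "encoding/json", "errors", "os", "path/filepath".
func loadScratchHandles(vmDir string) (model.VMHandles, error) {
	data, err := os.ReadFile(filepath.Join(vmDir, "handles.json"))
	if errors.Is(err, os.ErrNotExist) {
		// No scratch file: nothing is held on this VM's behalf.
		return model.VMHandles{}, nil
	}
	if err != nil {
		return model.VMHandles{}, err
	}
	var h model.VMHandles
	if err := json.Unmarshal(data, &h); err != nil {
		return model.VMHandles{}, err
	}
	return h, nil
}
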
diff --git a/internal/model/vm_handles.go b/internal/model/vm_handles.go
new file mode 100644
index 0000000..8a68071
--- /dev/null
+++ b/internal/model/vm_handles.go
@@ -0,0 +1,51 @@
+package model
+
+// VMHandles captures the transient, per-boot kernel/process handles
+// that banger obtains while starting a VM and releases when stopping
+// it. Unlike VMRuntime (durable spec + identity + derived paths),
+// nothing in VMHandles survives a daemon restart in authoritative
+// form: each value is either rediscovered from the OS (PID from the
+// firecracker api socket, DM name deterministically from the VM ID)
+// or read from a per-VM scratch file that the daemon rebuilds at
+// every start.
+//
+// The daemon keeps an in-memory cache keyed by VM ID. Lifecycle
+// transitions update the cache and a small `handles.json` scratch
+// file in the VM's state directory; daemon startup reconciles
+// by loading that file and verifying each handle against the live
+// OS state. If anything is stale the VM is marked stopped and the
+// cache entry is dropped.
+//
+// VMHandles never appears in the `vms` SQLite rows. Keeping it off
+// the durable schema was the whole point of the split — persistent
+// records describe what a VM SHOULD be; handles describe what is
+// currently true about it.
+type VMHandles struct {
+	// PID is the firecracker process PID. Zero means "not running
+	// (from our perspective)". Always verifiable via
+	// /proc/<pid>/cmdline matching the api socket path.
+	PID int `json:"pid,omitempty"`
+
+	// TapDevice is the kernel tap interface name (e.g. "tap-fc-0001")
+	// bound to the VM's virtio-net. Released on stop.
+	TapDevice string `json:"tap_device,omitempty"`
+
+	// BaseLoop and COWLoop are the two loop devices backing the
+	// dm-snapshot layer (read-only base = rootfs; read-write overlay
+	// = per-VM COW file). Released via losetup -d on stop.
+	BaseLoop string `json:"base_loop,omitempty"`
+	COWLoop  string `json:"cow_loop,omitempty"`
+
+	// DMName is the device-mapper target name; deterministic from the
+	// VM ID (see dmsnap.SnapshotName). DMDev is the corresponding
+	// /dev/mapper/<name> path. Torn down by `dmsetup remove` on stop.
+	DMName string `json:"dm_name,omitempty"`
+	DMDev  string `json:"dm_dev,omitempty"`
+}
+
+// IsZero reports whether every handle field is unset. Useful as a
+// cheap "this VM has no kernel/process resources held on our behalf"
+// check.
+func (h VMHandles) IsZero() bool {
+	return h.PID == 0 && h.TapDevice == "" && h.BaseLoop == "" && h.COWLoop == "" && h.DMName == "" && h.DMDev == ""
+}
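
Note (illustrative, not part of the patch): IsZero lets callers treat "no cache entry" and "entry present but empty" as the same condition, which is exactly the post-stop invariant the StopVM test earlier in this diff asserts. A typical cleanup-side guard would therefore look like the fragment below.

// Illustrative guard mirroring the StopVM test's invariant.
h, ok := d.handles.get(vm.ID)
if !ok || h.IsZero() {
	return nil // no kernel/process resources held for this VM
}
// otherwise tear down h.TapDevice, h.BaseLoop / h.COWLoop, h.DMName ...
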
diff --git a/internal/store/store_test.go b/internal/store/store_test.go
index 164ad4e..ea535fc 100644
--- a/internal/store/store_test.go
+++ b/internal/store/store_test.go
@@ -372,7 +372,6 @@ func sampleVM(name, imageID, guestIP string) model.VMRecord {
 		Runtime: model.VMRuntime{
 			State:       model.VMStateStopped,
 			GuestIP:     guestIP,
-			TapDevice:   "tap-" + name,
 			APISockPath: "/tmp/" + name + ".sock",
 			LogPath:     "/tmp/" + name + ".log",
 			MetricsPath: "/tmp/" + name + ".metrics",