daemon: tighten concurrency around pulls, cleanup, and handle persistence

Four targeted fixes from a race-condition audit of the daemon package.
None change behaviour on the happy path; each closes a window where a
concurrent or interrupted RPC could strand state on the host.

  - KernelDelete now holds the same per-name lock as KernelPull /
    readOrAutoPullKernel. Without it, a delete racing a concurrent
    pull could remove files mid-write or land between the pull's
    manifest write and its first use.

  - cleanupRuntime no longer early-returns on an inner waitForExit
    failure; DM snapshot, capability, and tap teardown always run, and
    every error is folded into the single errors.Join result that is
    returned (see the sketch below). EBUSY against a still-alive
    firecracker is benign and surfaces in the joined error rather than
    stranding kernel state across daemon restarts.

  - Per-name image / kernel pull locks switch from *sync.Mutex to a
    1-buffered chan struct{} (see the sketch below). Acquire is a
    select on ctx.Done(), so a peer waiting behind a pull whose RPC
    was cancelled can bail out instead of blocking forever on a pull
    nobody is consuming.

  - setVMHandles writes the per-VM scratch file before updating the
    in-memory cache. A daemon crash between the two now leaves disk
    ahead of memory (recoverable: reconcile re-seeds the cache from
    the file on next start) rather than memory ahead of disk (lost
    handles → stranded DM/loops/tap).
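
A minimal sketch of the per-name, ctx-aware lock shape from the first
and third items; the type and function names here are illustrative,
not the daemon's actual identifiers:

    package daemon

    import (
        "context"
        "sync"
    )

    // nameLocks hands out one 1-buffered channel per name; a full
    // channel means that name is currently held.
    type nameLocks struct {
        mu    sync.Mutex
        locks map[string]chan struct{}
    }

    func (n *nameLocks) lockFor(name string) chan struct{} {
        n.mu.Lock()
        defer n.mu.Unlock()
        if n.locks == nil {
            n.locks = make(map[string]chan struct{})
        }
        l, ok := n.locks[name]
        if !ok {
            l = make(chan struct{}, 1)
            n.locks[name] = l
        }
        return l
    }

    // acquire blocks until the per-name lock is free or ctx is
    // cancelled, so a caller queued behind an abandoned pull can bail
    // out. The returned func releases the lock.
    func (n *nameLocks) acquire(ctx context.Context, name string) (func(), error) {
        l := n.lockFor(name)
        select {
        case l <- struct{}{}:
            return func() { <-l }, nil
        case <-ctx.Done():
            return nil, ctx.Err()
        }
    }

In this shape, KernelPull, readOrAutoPullKernel, and KernelDelete would
all go through acquire(ctx, name) before touching on-disk state, so a
delete can never interleave with an in-flight pull, and a cancelled
waiter returns ctx.Err() instead of blocking forever.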

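A sketch of the no-early-return teardown shape from the second item,
using errors.Join; the step names are illustrative stand-ins for the
real DM snapshot, capability, and tap teardown helpers:

    package daemon

    import "errors"

    // cleanupSketch runs every teardown step even when an earlier one
    // fails, and returns all failures joined into a single error.
    func cleanupSketch(waitForExit, removeSnapshot, dropCaps, deleteTap func() error) error {
        var errs []error
        if err := waitForExit(); err != nil {
            // e.g. EBUSY while firecracker is still alive: benign,
            // recorded, and teardown continues instead of returning.
            errs = append(errs, err)
        }
        for _, step := range []func() error{removeSnapshot, dropCaps, deleteTap} {
            if err := step(); err != nil {
                errs = append(errs, err)
            }
        }
        return errors.Join(errs...) // nil when every step succeeded
    }
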
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Author: Thales Maciel
Date:   2026-04-27 19:32:43 -03:00
Commit: c4e1cb5953 (parent 777b597a1e)
6 changed files with 99 additions and 39 deletions


@@ -138,16 +138,23 @@ func (s *VMService) vmHandles(vmID string) model.VMHandles {
 // fields onto VMRuntime, and writes the per-VM scratch file.
 // Scratch-file errors are logged but not returned; the cache remains
 // authoritative while the daemon is alive.
+//
+// Write order: file first, cache second. A daemon crash between the
+// two leaves the on-disk scratch file ahead of the in-memory cache —
+// which is the recoverable direction, since reconcile re-seeds the
+// cache from the file on the next start. The reverse order would let
+// a crash strand handles the daemon already saw as live but never
+// persisted, breaking the next-start teardown of DM/loops/tap.
 func (s *VMService) setVMHandles(vm *model.VMRecord, h model.VMHandles) {
 	if s == nil || vm == nil {
 		return
 	}
 	persistRuntimeTeardownState(vm, h)
 	s.ensureHandleCache()
-	s.handles.set(vm.ID, h)
 	if err := writeHandlesFile(vm.Runtime.VMDir, h); err != nil && s.logger != nil {
 		s.logger.Warn("persist handles.json failed", "vm_id", vm.ID, "error", err.Error())
 	}
+	s.handles.set(vm.ID, h)
 }
 
 // clearVMHandles drops the cache entry and removes the scratch