daemon: tighten concurrency around pulls, cleanup, and handle persistence

Four targeted fixes from a race-condition audit of the daemon package.
None change behaviour on the happy path; each closes a window where a
concurrent or interrupted RPC could strand state on the host.

  - KernelDelete now holds the same per-name lock as KernelPull /
    readOrAutoPullKernel. Without it, a delete racing a concurrent
    pull could remove files mid-write or land between the pull's
    manifest write and its first use.

  - cleanupRuntime no longer early-returns on an inner waitForExit
    failure; DM snapshot, capability, and tap teardown always run and
    every error is folded into the returned errors.Join. EBUSY against
    a still-alive firecracker is benign and surfaces in the joined
    error rather than stranding kernel state across daemon restarts.

  - Per-name image / kernel pull locks switch from *sync.Mutex to a
    1-buffered chan struct{}. Acquire is a select on ctx.Done(), so a
    peer waiting behind a pull whose RPC was cancelled can bail out
    instead of blocking forever on a pull nobody is consuming (see
    the first sketch after this list).

  - setVMHandles writes the per-VM scratch file before updating the
    in-memory cache. A daemon crash between the two now leaves disk
    ahead of memory (recoverable: reconcile re-seeds the cache from
    the file on next start) rather than memory ahead of disk (lost
    handles → stranded DM/loops/tap). See the second sketch below.
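
A minimal sketch of the channel-as-lock pattern, with invented names
(nameLock, acquire, release, package placement); the daemon's real
identifiers may differ:

    package daemon // illustrative placement

    import "context"

    // nameLock is a 1-buffered channel used as a mutex whose
    // acquire can be abandoned when the caller's context is done.
    type nameLock chan struct{}

    func newNameLock() nameLock { return make(nameLock, 1) }

    // acquire takes the single slot, or gives up if ctx is
    // cancelled first (e.g. the waiter's RPC went away).
    func (l nameLock) acquire(ctx context.Context) error {
        select {
        case l <- struct{}{}:
            return nil
        case <-ctx.Done():
            return ctx.Err()
        }
    }

    // release frees the slot for the next waiter.
    func (l nameLock) release() { <-l }

With KernelDelete, KernelPull, and readOrAutoPullKernel all acquiring
the same per-name lock this way, a delete cannot interleave with an
in-flight pull, and a waiter whose RPC is cancelled returns ctx.Err()
instead of parking forever.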

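And a sketch of the disk-before-memory ordering from the last bullet;
the signature mirrors setVMHandles, but the mutex, cache map, key,
and the writeHandlesFile helper are invented stand-ins, not the
daemon's real fields:

    // Persist first, cache second: a crash between the two steps
    // leaves disk ahead of memory, which reconcile repairs on the
    // next start by re-seeding the cache from the file.
    func (s *VMService) setVMHandles(vm model.VMRecord, h model.VMHandles) error {
        if err := writeHandlesFile(vm, h); err != nil {
            return err // nothing cached that disk does not know about
        }
        s.mu.Lock() // s.mu / s.handles: hypothetical cache fields
        s.handles[vm.Name] = h
        s.mu.Unlock()
        return nil
    }
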
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Thales Maciel 2026-04-27 19:32:43 -03:00
parent 777b597a1e
commit c4e1cb5953
6 changed files with 99 additions and 39 deletions


@@ -99,6 +99,15 @@ func teardownHandlesForCleanup(vm model.VMRecord, live model.VMHandles) model.VMHandles {
 // because it reaches into handles (VMService-owned); the capability
 // teardown goes through the capHooks seam to keep Daemon out of the
 // dependency chain.
+//
+// Idempotency contract: every step runs even when an earlier step
+// fails, and the per-step errors are joined into the returned value.
+// A waitForExit timeout (firecracker refused to die) used to early-
+// return, leaving DM/feature/tap state stranded on the host across
+// daemon restarts. With collect-and-continue the kernel teardowns
+// still attempt; in the worst case (firecracker actually still alive)
+// they fail with EBUSY which is also surfaced via errors.Join — no
+// damage, but the operator sees the full picture.
 func (s *VMService) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserveDisks bool) error {
     if s.logger != nil {
         s.logger.Debug("cleanup runtime", append(vmLogAttrs(vm), "preserve_disks", preserveDisks)...)
@@ -110,10 +119,12 @@ func (s *VMService) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserveDisks bool) error {
             cleanupPID = pid
         }
     }
+    var waitErr error
     if cleanupPID > 0 && system.ProcessRunning(cleanupPID, vm.Runtime.APISockPath) {
         _ = s.net.killVMProcess(ctx, cleanupPID)
-        if err := s.net.waitForExit(ctx, cleanupPID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
-            return err
+        waitErr = s.net.waitForExit(ctx, cleanupPID, vm.Runtime.APISockPath, 30*time.Second)
+        if waitErr != nil && s.logger != nil {
+            s.logger.Warn("cleanup wait_for_exit failed; continuing teardown", append(vmLogAttrs(vm), "pid", cleanupPID, "error", waitErr.Error())...)
         }
     }
     handles := teardownHandlesForCleanup(vm, h)
@@ -143,9 +154,9 @@ func (s *VMService) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserveDisks bool) error {
     // when the caller forgets to call clearVMHandles explicitly.
     s.clearVMHandles(vm)
     if !preserveDisks && vm.Runtime.VMDir != "" {
-        return errors.Join(snapshotErr, featureErr, tapErr, os.RemoveAll(vm.Runtime.VMDir))
+        return errors.Join(waitErr, snapshotErr, featureErr, tapErr, os.RemoveAll(vm.Runtime.VMDir))
     }
-    return errors.Join(snapshotErr, featureErr, tapErr)
+    return errors.Join(waitErr, snapshotErr, featureErr, tapErr)
 }

 func (s *VMService) generateName(ctx context.Context) (string, error) {