daemon: tighten concurrency around pulls, cleanup, and handle persistence

Four targeted fixes from a race-condition audit of the daemon package.
None changes behaviour on the happy path; each closes a window where a
concurrent or interrupted RPC could strand state on the host.

  - KernelDelete now holds the same per-name lock as KernelPull /
    readOrAutoPullKernel. Without it, a delete racing a concurrent
    pull could remove files mid-write or land between the pull's
    manifest write and its first use.

  - cleanupRuntime no longer early-returns on an inner waitForExit
    failure; DM snapshot, capability, and tap teardown always run and
    every error is folded into the returned errors.Join. EBUSY against
    a still-alive firecracker is benign and surfaces in the joined
    error rather than stranding kernel state across daemon restarts.

  - Per-name image / kernel pull locks switch from *sync.Mutex to a
    1-buffered chan struct{}. Acquire is a select on ctx.Done(), so a
    peer waiting behind a pull whose RPC was cancelled can bail out
    instead of blocking forever on a pull nobody is consuming.

  - setVMHandles writes the per-VM scratch file before updating the
    in-memory cache. A daemon crash between the two now leaves disk
    ahead of memory (recoverable: reconcile re-seeds the cache from
    the file on next start) rather than memory ahead of disk (lost
    handles → stranded DM/loops/tap).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Thales Maciel 2026-04-27 19:32:43 -03:00
parent 777b597a1e
commit c4e1cb5953
No known key found for this signature in database
GPG key ID: 33112E6833C34679
6 changed files with 99 additions and 39 deletions

View file

@ -34,10 +34,19 @@ func (s *ImageService) KernelShow(_ context.Context, name string) (api.KernelEnt
return kernelEntryToAPI(entry), nil
}
func (s *ImageService) KernelDelete(_ context.Context, name string) error {
// KernelDelete removes the locally stored kernel entry named name.
// It validates the name, then acquires the same per-name pull lock
// used by KernelPull / readOrAutoPullKernel before deleting, so a
// delete cannot race an in-flight pull. Acquisition respects ctx:
// if the context is cancelled while waiting, the error is returned
// and nothing is deleted.
func (s *ImageService) KernelDelete(ctx context.Context, name string) error {
if err := kernelcat.ValidateName(name); err != nil {
return err
}
// Hold the same per-name lock KernelPull / readOrAutoPullKernel
// take. Without it, a delete racing a concurrent pull can land
// between the pull's manifest write and the entry's first use,
// or remove files the pull is still writing.
release, err := s.acquireKernelPullLock(ctx, name)
if err != nil {
return err
}
// release must run even if DeleteLocal fails, so a later pull
// of the same name is not blocked forever.
defer release()
return kernelcat.DeleteLocal(s.layout.KernelsDir, name)
}
@ -129,9 +138,11 @@ func (s *ImageService) KernelPull(ctx context.Context, params api.KernelPullPara
return api.KernelEntry{}, err
}
lock := s.kernelPullLock(name)
lock.Lock()
defer lock.Unlock()
release, err := s.acquireKernelPullLock(ctx, name)
if err != nil {
return api.KernelEntry{}, err
}
defer release()
if !params.Force {
if _, err := kernelcat.ReadLocal(s.layout.KernelsDir, name); err == nil {