daemon: shrink createVMMu + imageOpsMu to reservation/publication windows

Before: createVMMu was held across the whole of CreateVM — including
image resolution (which could fire a full auto-pull) and startVMLocked
(boot of multiple seconds). imageOpsMu was held across the whole of
PullImage/RegisterImage/PromoteImage/DeleteImage, so any slow OCI pull,
bundle download, or file copy blocked every other image mutation and
every other VM create that needed to auto-pull. The async create API
bought nothing if all creates serialised on the same mutex.

CreateVM is now three phases:

 1. Validate + resolve image (possibly auto-pulling). No global lock.
 2. reserveVM: take createVMMu only long enough to re-check the name
    is free, allocate the next guest IP, and UpsertVM the "created"
    row. Milliseconds.
 3. startVMLocked: run the full boot flow under the per-VM lock only.

Parallel creates of different VMs now overlap on image resolution +
boot; they contend only on createVMMu during the brief reservation step.

For the image surface a new publishImage helper isolates the commit
atom (recheck name free, atomic rename stagingDir→finalDir, UpsertImage)
under imageOpsMu. pullFromBundle + pullFromOCI do their network fetch
+ ext4 build + ownership fixup + agent injection outside the lock;
Register moves validation + kernel resolution outside; Promote moves
file copy + SSH-key seeding outside; Delete keeps a brief lock over
the lookup + reference check + store delete and does file cleanup
unlocked.

Two concurrency tests assert the new behaviour:
 - TestPullImageDoesNotSerialiseOnDifferentNames fails against the old
   code (the second pull blocks on imageOpsMu and never reaches the body).
 - TestPullImageRejectsNameClashAtPublish confirms the publish-window
   recheck is what enforces name uniqueness now that the body runs
   unlocked — exactly one winner.

ARCHITECTURE.md updated to describe the new scope explicitly instead
of calling the locks "narrow".

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Thales Maciel 2026-04-20 13:44:22 -03:00
parent afe91e805a
commit 99d0811097
No known key found for this signature in database
GPG key ID: 33112E6833C34679
5 changed files with 390 additions and 95 deletions

View file

@ -36,10 +36,15 @@ const minPullExt4Size int64 = 1 << 30 // 1 GiB
//
// Kernel info falls back through: `params.KernelRef` → catalog entry's
// `kernel_ref` (bundle path only) → `params.Kernel/Initrd/ModulesDir`.
//
// Concurrency: the slow staging work (network fetch, ext4 build,
// ownership fixup, guest-agent injection) runs WITHOUT imageOpsMu so
// parallel pulls of different images interleave. imageOpsMu is taken
// only for the publish window — recheck name is free, rename the
// staging dir to the final artifact dir, insert the store row. If two
// pulls race to the same name, the loser fails fast at the recheck
// and its staging dir is cleaned up via defer.
func (d *Daemon) PullImage(ctx context.Context, params api.ImagePullParams) (model.Image, error) {
d.imageOpsMu.Lock()
defer d.imageOpsMu.Unlock()
ref := strings.TrimSpace(params.Ref)
if ref == "" {
return model.Image{}, errors.New("reference is required")
@ -55,6 +60,38 @@ func (d *Daemon) PullImage(ctx context.Context, params api.ImagePullParams) (mod
return d.pullFromOCI(ctx, params)
}
// publishImage is the short critical section shared by the image-
// creation paths (pull bundle/OCI, register, promote). While holding
// imageOpsMu it:
//
//  1. re-checks that image.Name is not already present in the store,
//  2. renames stagingDir to finalDir (skipped when finalDir is empty),
//  3. upserts the image row.
//
// Parameters:
//   - image: the fully populated row to persist.
//   - stagingDir: directory holding the freshly built artifacts; only
//     read when finalDir is non-empty.
//   - finalDir: destination of the rename. "" means the artifacts are
//     already in place (e.g. RegisterImage, which only touches the
//     store) and no filesystem work is done here.
//
// Returns the persisted image on success, or a zero model.Image and an
// error on a name clash, a failed rename, or a failed upsert.
//
// Cleanup contract: publishImage never removes stagingDir. The caller
// keeps its own deferred cleanup armed until publishImage returns
// successfully and disarms it afterwards. If the upsert fails after the
// rename has already happened, finalDir is removed here on a best-effort
// basis, because the artifacts no longer live under stagingDir.
//
// This function does not itself verify that finalDir is absent; the
// caller is expected to pick a finalDir that does not exist yet.
//
// NOTE(review): any non-nil error from GetImageByName is treated as
// "name is free" — confirm the store can only fail with not-found here.
func (d *Daemon) publishImage(ctx context.Context, image model.Image, stagingDir, finalDir string) (model.Image, error) {
	d.imageOpsMu.Lock()
	defer d.imageOpsMu.Unlock()

	// Uniqueness recheck: a successful lookup means another creator
	// already published this name.
	existing, lookupErr := d.store.GetImageByName(ctx, image.Name)
	if lookupErr == nil {
		return model.Image{}, fmt.Errorf("image %q already exists (id=%s); pick a different --name or delete it first", image.Name, existing.ID)
	}

	// The rename is the publication step for paths that stage artifacts.
	movesArtifacts := finalDir != ""
	if movesArtifacts {
		if renameErr := os.Rename(stagingDir, finalDir); renameErr != nil {
			return model.Image{}, fmt.Errorf("publish artifact dir: %w", renameErr)
		}
	}

	upsertErr := d.store.UpsertImage(ctx, image)
	if upsertErr == nil {
		return image, nil
	}

	// The row was not written; roll back the published directory so no
	// orphaned artifacts are left behind. Best-effort: the upsert error
	// is the one worth reporting.
	if movesArtifacts {
		_ = os.RemoveAll(finalDir)
	}
	return model.Image{}, upsertErr
}
// pullFromOCI is the original OCI-registry-pull path. See PullImage for
// the intent.
func (d *Daemon) pullFromOCI(ctx context.Context, params api.ImagePullParams) (image model.Image, err error) {
@ -137,11 +174,6 @@ func (d *Daemon) pullFromOCI(ctx context.Context, params api.ImagePullParams) (i
return model.Image{}, fmt.Errorf("stage boot artifacts: %w", err)
}
if err := os.Rename(stagingDir, finalDir); err != nil {
return model.Image{}, fmt.Errorf("publish artifact dir: %w", err)
}
cleanupStaging = false
now := model.Now()
image = model.Image{
ID: id,
@ -155,11 +187,12 @@ func (d *Daemon) pullFromOCI(ctx context.Context, params api.ImagePullParams) (i
CreatedAt: now,
UpdatedAt: now,
}
if err := d.store.UpsertImage(ctx, image); err != nil {
_ = os.RemoveAll(finalDir)
published, err := d.publishImage(ctx, image, stagingDir, finalDir)
if err != nil {
return model.Image{}, err
}
return image, nil
cleanupStaging = false
return published, nil
}
// pullFromBundle is the imagecat-backed path: download a ready-to-boot
@ -218,11 +251,6 @@ func (d *Daemon) pullFromBundle(ctx context.Context, params api.ImagePullParams,
return model.Image{}, fmt.Errorf("stage boot artifacts: %w", err)
}
if err := os.Rename(stagingDir, finalDir); err != nil {
return model.Image{}, fmt.Errorf("publish artifact dir: %w", err)
}
cleanupStaging = false
now := model.Now()
image = model.Image{
ID: id,
@ -236,11 +264,12 @@ func (d *Daemon) pullFromBundle(ctx context.Context, params api.ImagePullParams,
CreatedAt: now,
UpdatedAt: now,
}
if err := d.store.UpsertImage(ctx, image); err != nil {
_ = os.RemoveAll(finalDir)
published, err := d.publishImage(ctx, image, stagingDir, finalDir)
if err != nil {
return model.Image{}, err
}
return image, nil
cleanupStaging = false
return published, nil
}
// runBundleFetch is the seam tests substitute. nil → real implementation.