daemon: shrink createVMMu + imageOpsMu to reservation/publication windows
Before: createVMMu was held across the whole of CreateVM — including
image resolution (which could fire a full auto-pull) and startVMLocked
(boot of multiple seconds). imageOpsMu was held across the whole of
PullImage/RegisterImage/PromoteImage/DeleteImage, so any slow OCI pull,
bundle download, or file copy blocked every other image mutation and
every other VM create that needed to auto-pull. The async create API
bought nothing if all creates serialised on the same mutex.
CreateVM is now three phases:
1. Validate + resolve image (possibly auto-pulling). No global lock.
2. reserveVM: take createVMMu only long enough to re-check the name
is free, allocate the next guest IP, and UpsertVM the "created"
row. Milliseconds.
3. startVMLocked: run the full boot flow under the per-VM lock only.
Parallel creates of different VMs now overlap on image resolution +
boot; they contend only across the reservation claim.
For the image surface a new publishImage helper isolates the commit
atom (recheck name free, atomic rename stagingDir→finalDir, UpsertImage)
under imageOpsMu. pullFromBundle + pullFromOCI do their network fetch
+ ext4 build + ownership fixup + agent injection outside the lock;
Register moves validation + kernel resolution outside; Promote moves
file copy + SSH-key seeding outside; Delete keeps a brief lock over
the lookup + reference check + store delete and does file cleanup
unlocked.
Two concurrency tests assert the new behaviour:
- TestPullImageDoesNotSerialiseOnDifferentNames fails the old code
(second pull blocks on imageOpsMu and never reaches the body).
- TestPullImageRejectsNameClashAtPublish confirms the publish-window
recheck is what enforces name uniqueness now that the body runs
unlocked — exactly one winner.
ARCHITECTURE.md updated to describe the new scope explicitly instead
of calling the locks "narrow".
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
afe91e805a
commit
99d0811097
5 changed files with 390 additions and 95 deletions
|
|
@@ -36,10 +36,15 @@ const minPullExt4Size int64 = 1 << 30 // 1 GiB
|
|||
//
|
||||
// Kernel info falls back through: `params.KernelRef` → catalog entry's
|
||||
// `kernel_ref` (bundle path only) → `params.Kernel/Initrd/ModulesDir`.
|
||||
//
|
||||
// Concurrency: the slow staging work (network fetch, ext4 build,
|
||||
// ownership fixup, guest-agent injection) runs WITHOUT imageOpsMu so
|
||||
// parallel pulls of different images interleave. imageOpsMu is taken
|
||||
// only for the publish window — recheck name is free, rename the
|
||||
// staging dir to the final artifact dir, insert the store row. If two
|
||||
// pulls race to the same name, the loser fails fast at the recheck
|
||||
// and its staging dir is cleaned up via defer.
|
||||
func (d *Daemon) PullImage(ctx context.Context, params api.ImagePullParams) (model.Image, error) {
|
||||
d.imageOpsMu.Lock()
|
||||
defer d.imageOpsMu.Unlock()
|
||||
|
||||
ref := strings.TrimSpace(params.Ref)
|
||||
if ref == "" {
|
||||
return model.Image{}, errors.New("reference is required")
|
||||
|
|
@@ -55,6 +60,38 @@ func (d *Daemon) PullImage(ctx context.Context, params api.ImagePullParams) (mod
|
|||
return d.pullFromOCI(ctx, params)
|
||||
}
|
||||
|
||||
// publishImage is the narrow critical section shared by every image-
|
||||
// creation path (pull bundle/OCI, register, promote). It re-verifies
|
||||
// that `image.Name` is still free, atomically renames the staging
|
||||
// directory to its final home (when applicable), and persists the row.
|
||||
// The caller owns stagingDir cleanup on failure via its own defer; on
|
||||
// success, publishImage unsets it so the defer is a no-op.
|
||||
//
|
||||
// finalDir == "" means "already published" (the caller built artifacts
|
||||
// in place, e.g. RegisterImage which only touches the store). When
|
||||
// non-empty the rename is the publication atom: finalDir must not
|
||||
// already exist before the rename fires.
|
||||
func (d *Daemon) publishImage(ctx context.Context, image model.Image, stagingDir, finalDir string) (model.Image, error) {
|
||||
d.imageOpsMu.Lock()
|
||||
defer d.imageOpsMu.Unlock()
|
||||
|
||||
if existing, err := d.store.GetImageByName(ctx, image.Name); err == nil {
|
||||
return model.Image{}, fmt.Errorf("image %q already exists (id=%s); pick a different --name or delete it first", image.Name, existing.ID)
|
||||
}
|
||||
if finalDir != "" {
|
||||
if err := os.Rename(stagingDir, finalDir); err != nil {
|
||||
return model.Image{}, fmt.Errorf("publish artifact dir: %w", err)
|
||||
}
|
||||
}
|
||||
if err := d.store.UpsertImage(ctx, image); err != nil {
|
||||
if finalDir != "" {
|
||||
_ = os.RemoveAll(finalDir)
|
||||
}
|
||||
return model.Image{}, err
|
||||
}
|
||||
return image, nil
|
||||
}
|
||||
|
||||
// pullFromOCI is the original OCI-registry-pull path. See PullImage for
|
||||
// the intent.
|
||||
func (d *Daemon) pullFromOCI(ctx context.Context, params api.ImagePullParams) (image model.Image, err error) {
|
||||
|
|
@@ -137,11 +174,6 @@ func (d *Daemon) pullFromOCI(ctx context.Context, params api.ImagePullParams) (i
|
|||
return model.Image{}, fmt.Errorf("stage boot artifacts: %w", err)
|
||||
}
|
||||
|
||||
if err := os.Rename(stagingDir, finalDir); err != nil {
|
||||
return model.Image{}, fmt.Errorf("publish artifact dir: %w", err)
|
||||
}
|
||||
cleanupStaging = false
|
||||
|
||||
now := model.Now()
|
||||
image = model.Image{
|
||||
ID: id,
|
||||
|
|
@@ -155,11 +187,12 @@ func (d *Daemon) pullFromOCI(ctx context.Context, params api.ImagePullParams) (i
|
|||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
}
|
||||
if err := d.store.UpsertImage(ctx, image); err != nil {
|
||||
_ = os.RemoveAll(finalDir)
|
||||
published, err := d.publishImage(ctx, image, stagingDir, finalDir)
|
||||
if err != nil {
|
||||
return model.Image{}, err
|
||||
}
|
||||
return image, nil
|
||||
cleanupStaging = false
|
||||
return published, nil
|
||||
}
|
||||
|
||||
// pullFromBundle is the imagecat-backed path: download a ready-to-boot
|
||||
|
|
@@ -218,11 +251,6 @@ func (d *Daemon) pullFromBundle(ctx context.Context, params api.ImagePullParams,
|
|||
return model.Image{}, fmt.Errorf("stage boot artifacts: %w", err)
|
||||
}
|
||||
|
||||
if err := os.Rename(stagingDir, finalDir); err != nil {
|
||||
return model.Image{}, fmt.Errorf("publish artifact dir: %w", err)
|
||||
}
|
||||
cleanupStaging = false
|
||||
|
||||
now := model.Now()
|
||||
image = model.Image{
|
||||
ID: id,
|
||||
|
|
@@ -236,11 +264,12 @@ func (d *Daemon) pullFromBundle(ctx context.Context, params api.ImagePullParams,
|
|||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
}
|
||||
if err := d.store.UpsertImage(ctx, image); err != nil {
|
||||
_ = os.RemoveAll(finalDir)
|
||||
published, err := d.publishImage(ctx, image, stagingDir, finalDir)
|
||||
if err != nil {
|
||||
return model.Image{}, err
|
||||
}
|
||||
return image, nil
|
||||
cleanupStaging = false
|
||||
return published, nil
|
||||
}
|
||||
|
||||
// runBundleFetch is the seam tests substitute. nil → real implementation.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue