daemon: shrink createVMMu + imageOpsMu to reservation/publication windows
Before: createVMMu was held across the whole of CreateVM — including
image resolution (which could fire a full auto-pull) and startVMLocked
(boot of multiple seconds). imageOpsMu was held across the whole of
PullImage/RegisterImage/PromoteImage/DeleteImage, so any slow OCI pull,
bundle download, or file copy blocked every other image mutation and
every other VM create that needed to auto-pull. The async create API
bought nothing if all creates serialised on the same mutex.
CreateVM is now three phases:
1. Validate + resolve image (possibly auto-pulling). No global lock.
2. reserveVM: take createVMMu only long enough to re-check the name
is free, allocate the next guest IP, and UpsertVM the "created"
row. Milliseconds.
3. startVMLocked: run the full boot flow under the per-VM lock only.
Parallel creates of different VMs now overlap on image resolution +
boot; they contend only across the reservation claim.
For the image surface a new publishImage helper isolates the commit
atom (recheck name free, atomic rename stagingDir→finalDir, UpsertImage)
under imageOpsMu. pullFromBundle + pullFromOCI do their network fetch
+ ext4 build + ownership fixup + agent injection outside the lock;
Register moves validation + kernel resolution outside; Promote moves
file copy + SSH-key seeding outside; Delete keeps a brief lock over
the lookup + reference check + store delete and does file cleanup
unlocked.
Two concurrency tests assert the new behaviour:
- TestPullImageDoesNotSerialiseOnDifferentNames fails the old code
(second pull blocks on imageOpsMu and never reaches the body).
- TestPullImageRejectsNameClashAtPublish confirms the publish-window
recheck is what enforces name uniqueness now that the body runs
unlocked — exactly one winner.
ARCHITECTURE.md updated to describe the new scope explicitly instead
of calling the locks "narrow".
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
afe91e805a
commit
99d0811097
5 changed files with 390 additions and 95 deletions
|
|
@ -2,6 +2,8 @@ package daemon
|
|||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
|
@ -13,9 +15,19 @@ import (
|
|||
"banger/internal/vmdns"
|
||||
)
|
||||
|
||||
// CreateVM is split into three phases so the global createVMMu guards
|
||||
// only the narrow name+IP reservation window, not the slow image
|
||||
// resolution or the multi-second boot flow:
|
||||
//
|
||||
// 1. Validate + resolve image. No global lock. Image auto-pull
|
||||
// self-locks via imageOpsMu (which is also now publication-only).
|
||||
// 2. Reserve a row: generate id, pick next IP, claim the name,
|
||||
// UpsertVM the "created" record. Held under createVMMu so two
|
||||
// concurrent `vm create --name foo` calls can't both think they
|
||||
// won.
|
||||
// 3. Boot. Only the per-VM lock is held — parallel creates against
|
||||
// different VMs fully overlap.
|
||||
func (d *Daemon) CreateVM(ctx context.Context, params api.VMCreateParams) (vm model.VMRecord, err error) {
|
||||
d.createVMMu.Lock()
|
||||
defer d.createVMMu.Unlock()
|
||||
op := d.beginOperation("vm.create")
|
||||
defer func() {
|
||||
if err != nil {
|
||||
|
|
@ -42,34 +54,7 @@ func (d *Daemon) CreateVM(ctx context.Context, params api.VMCreateParams) (vm mo
|
|||
}
|
||||
vmCreateStage(ctx, "resolve_image", "using image "+image.Name)
|
||||
op.stage("image_resolved", imageLogAttrs(image)...)
|
||||
name := strings.TrimSpace(params.Name)
|
||||
if name == "" {
|
||||
name, err = d.generateName(ctx)
|
||||
if err != nil {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
}
|
||||
if _, err := d.FindVM(ctx, name); err == nil {
|
||||
return model.VMRecord{}, fmt.Errorf("vm name already exists: %s", name)
|
||||
}
|
||||
id, err := model.NewID()
|
||||
if err != nil {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
unlockVM := d.lockVMID(id)
|
||||
defer unlockVM()
|
||||
guestIP, err := d.store.NextGuestIP(ctx, bridgePrefix(d.config.BridgeIP))
|
||||
if err != nil {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
vmDir := filepath.Join(d.layout.VMsDir, id)
|
||||
if err := os.MkdirAll(vmDir, 0o755); err != nil {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
vsockCID, err := defaultVSockCID(guestIP)
|
||||
if err != nil {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
|
||||
systemOverlaySize := int64(model.DefaultSystemOverlaySize)
|
||||
if params.SystemOverlaySize != "" {
|
||||
systemOverlaySize, err = model.ParseSize(params.SystemOverlaySize)
|
||||
|
|
@ -84,7 +69,6 @@ func (d *Daemon) CreateVM(ctx context.Context, params api.VMCreateParams) (vm mo
|
|||
return model.VMRecord{}, err
|
||||
}
|
||||
}
|
||||
now := model.Now()
|
||||
spec := model.VMSpec{
|
||||
VCPUCount: optionalIntOrDefault(params.VCPUCount, model.DefaultVCPUCount),
|
||||
MemoryMiB: optionalIntOrDefault(params.MemoryMiB, model.DefaultMemoryMiB),
|
||||
|
|
@ -92,7 +76,69 @@ func (d *Daemon) CreateVM(ctx context.Context, params api.VMCreateParams) (vm mo
|
|||
WorkDiskSizeBytes: workDiskSize,
|
||||
NATEnabled: params.NATEnabled,
|
||||
}
|
||||
vm = model.VMRecord{
|
||||
|
||||
vm, err = d.reserveVM(ctx, strings.TrimSpace(params.Name), image, spec)
|
||||
if err != nil {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
op.stage("persisted", vmLogAttrs(vm)...)
|
||||
vmCreateBindVM(ctx, vm)
|
||||
vmCreateStage(ctx, "reserve_vm", fmt.Sprintf("allocated %s (%s)", vm.Name, vm.Runtime.GuestIP))
|
||||
|
||||
unlockVM := d.lockVMID(vm.ID)
|
||||
defer unlockVM()
|
||||
|
||||
if params.NoStart {
|
||||
vm.State = model.VMStateStopped
|
||||
vm.Runtime.State = model.VMStateStopped
|
||||
if err := d.store.UpsertVM(ctx, vm); err != nil {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
return vm, nil
|
||||
}
|
||||
return d.startVMLocked(ctx, vm, image)
|
||||
}
|
||||
|
||||
// reserveVM holds createVMMu only long enough to verify the name is
|
||||
// free, allocate a guest IP from the store, and persist the "created"
|
||||
// reservation row. Everything else (image resolution upstream, boot
|
||||
// downstream) runs outside this lock.
|
||||
func (d *Daemon) reserveVM(ctx context.Context, requestedName string, image model.Image, spec model.VMSpec) (model.VMRecord, error) {
|
||||
d.createVMMu.Lock()
|
||||
defer d.createVMMu.Unlock()
|
||||
|
||||
name := requestedName
|
||||
if name == "" {
|
||||
generated, err := d.generateName(ctx)
|
||||
if err != nil {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
name = generated
|
||||
}
|
||||
if _, err := d.FindVM(ctx, name); err == nil {
|
||||
return model.VMRecord{}, fmt.Errorf("vm name already exists: %s", name)
|
||||
} else if !errors.Is(err, sql.ErrNoRows) && !strings.Contains(err.Error(), "not found") {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
|
||||
id, err := model.NewID()
|
||||
if err != nil {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
guestIP, err := d.store.NextGuestIP(ctx, bridgePrefix(d.config.BridgeIP))
|
||||
if err != nil {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
vmDir := filepath.Join(d.layout.VMsDir, id)
|
||||
if err := os.MkdirAll(vmDir, 0o755); err != nil {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
vsockCID, err := defaultVSockCID(guestIP)
|
||||
if err != nil {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
now := model.Now()
|
||||
vm := model.VMRecord{
|
||||
ID: id,
|
||||
Name: name,
|
||||
ImageID: image.ID,
|
||||
|
|
@ -114,21 +160,10 @@ func (d *Daemon) CreateVM(ctx context.Context, params api.VMCreateParams) (vm mo
|
|||
MetricsPath: filepath.Join(vmDir, "metrics.json"),
|
||||
},
|
||||
}
|
||||
vmCreateBindVM(ctx, vm)
|
||||
vmCreateStage(ctx, "reserve_vm", fmt.Sprintf("allocated %s (%s)", vm.Name, vm.Runtime.GuestIP))
|
||||
if err := d.store.UpsertVM(ctx, vm); err != nil {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
op.stage("persisted", vmLogAttrs(vm)...)
|
||||
if params.NoStart {
|
||||
vm.State = model.VMStateStopped
|
||||
vm.Runtime.State = model.VMStateStopped
|
||||
if err := d.store.UpsertVM(ctx, vm); err != nil {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
return vm, nil
|
||||
}
|
||||
return d.startVMLocked(ctx, vm, image)
|
||||
return vm, nil
|
||||
}
|
||||
|
||||
// findOrAutoPullImage tries the local image store first; if the name
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue