banger/internal/daemon/vm_create.go
Thales Maciel 72882e45d7
daemon: serialise concurrent image/kernel pulls + atomic-rename seed refresh
Three concurrency bugs surfaced by `make smoke JOBS=4` that all stem
from `vm.create` paths assuming single-caller semantics:

1. **Kernel auto-pull manifest race.** Parallel `vm.create` calls that
   each need to auto-pull the same kernel ref both run kernelcat.Fetch
   concurrently against the same /var/lib/banger/kernels/<name>/
   directory. Fetch writes manifest.json non-atomically (truncate +
   write); the peer reads it back mid-write and trips
   "parse manifest for X: unexpected end of JSON input".

   Fix: per-name `sync.Mutex` map on `ImageService` (kernelPullLock);
   see the lock-map sketch after this list. `KernelPull` and
   `readOrAutoPullKernel` both acquire it and re-check
   `kernelcat.ReadLocal` once the lock is held, so a peer that finished
   the pull while we waited counts as success. `readOrAutoPullKernel`
   deliberately does NOT call `s.KernelPull`, because that path errors
   with "already pulled" on a peer success, which would be wrong for
   auto-pull. Different kernels stay parallel.

2. **Image auto-pull race.** Same shape as the kernel race but on the
   image side: parallel `vm.create` calls both run pullFromBundle /
   pullFromOCI for the missing image (each ~minutes of OCI fetch +
   ext4 build). The publishImage atom under imageOpsMu only protects
   the rename + UpsertImage commit, so the loser does all the work
   only to fail at the recheck with "image already exists".

   Fix: per-name `sync.Mutex` map on `ImageService` (imagePullLock),
   same pattern as the kernel lock. `findOrAutoPullImage` acquires it,
   re-checks FindImage, and only then calls PullImage. The loser
   short-circuits with the freshly-published image instead of redoing
   minutes of work. PullImage's own publishImage recheck stays as
   defense-in-depth for callers that bypass the auto-pull path.

3. **Work-seed refresh race.** When the host's SSH key has rotated
   since an image was last refreshed, `ensureAuthorizedKeyOnWorkDisk`
   triggers `refreshManagedWorkSeedFingerprint`, which used to rewrite
   the shared work-seed.ext4 in place via e2rm + e2cp. Peer `vm.create`
   calls doing parallel `MaterializeWorkDisk` rdumps observed a torn
   ext4 image — "Superblock checksum does not match superblock".

   Fix: stage the rewrite in a sibling tmpfile (`<seed>.refresh.<pid>-<ns>.tmp`)
   and atomic-rename it into place; see the rename sketch after this
   list. Concurrent readers either have the file open already (the
   kernel keeps the pre-rename inode alive) or open it after the rename
   (and see the new inode) — they never observe a partial state. Two
   parallel refreshes are idempotent (same daemon, same SSH key), so
   unique tmp names are enough; whichever rename lands last wins, with
   identical content. UpsertImage runs after the rename so the recorded
   fingerprint always matches what's on disk.
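
A minimal sketch of the per-name lock map behind fixes 1 and 2. The
wrapper type and field names below are illustrative; only the
imagePullLock/kernelPullLock accessors and the lock-then-recheck usage
come from the actual code:

```go
package pulllock // sketch only

import "sync"

// pullLocks hands out one mutex per image/kernel name, so pulls of the
// same name serialise while pulls of different names stay parallel.
// ImageService is assumed to embed something shaped like this.
type pullLocks struct {
	mu    sync.Mutex
	locks map[string]*sync.Mutex
}

// forName returns the mutex for a given name, creating it on first use.
func (p *pullLocks) forName(name string) *sync.Mutex {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.locks == nil {
		p.locks = make(map[string]*sync.Mutex)
	}
	l, ok := p.locks[name]
	if !ok {
		l = &sync.Mutex{}
		p.locks[name] = l
	}
	return l
}
```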

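A minimal sketch of the stage-then-rename pattern from fix 3. The
helper names and the copy step are illustrative; the real refresh runs
e2rm + e2cp against the staged copy before renaming:

```go
package seedrefresh // sketch only

import (
	"fmt"
	"io"
	"os"
	"time"
)

// refreshSeedAtomically rebuilds the work seed on a sibling tmp file and
// publishes it with a single rename. Readers either keep the pre-rename
// inode open or open the fully written new one; nobody sees a torn image.
func refreshSeedAtomically(seedPath string, rewrite func(staged string) error) error {
	tmp := fmt.Sprintf("%s.refresh.%d-%d.tmp", seedPath, os.Getpid(), time.Now().UnixNano())
	if err := copyFile(seedPath, tmp); err != nil {
		return err
	}
	defer os.Remove(tmp) // no-op once the rename has landed
	if err := rewrite(tmp); err != nil { // e.g. e2rm + e2cp on the staged copy
		return err
	}
	return os.Rename(tmp, seedPath) // atomic on the same filesystem
}

// copyFile is a hypothetical helper that duplicates the current seed
// into the staging path before it is modified.
func copyFile(src, dst string) error {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()
	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	defer out.Close()
	_, err = io.Copy(out, in)
	return err
}
```
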
Plus one smoke harness fix: reclassify `vm_prune` from `pure` to
`global`. `vm prune -f` removes ALL stopped VMs system-wide, not just
the ones the scenario created — so a parallel peer scenario whose VM
happens to be momentarily in `created`/`stopped` gets it wiped.
Moving prune to the post-pool serial phase keeps it from racing with
in-flight scenarios.

After all four fixes, `make smoke JOBS=4` passes 21/21 in 174s
(serial baseline 141s; the small overhead is the buffered-output and
`wait -n` semaphore cost — well worth the parallelism for fast-iter
work on a 32-core box).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 17:24:11 -03:00

package daemon

import (
	"context"
	"database/sql"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"strings"

	"banger/internal/api"
	"banger/internal/imagecat"
	"banger/internal/model"
	"banger/internal/vmdns"
)

// CreateVM is split into three phases so the global createVMMu guards
// only the narrow name+IP reservation window, not the slow image
// resolution or the multi-second boot flow:
//
//  1. Validate + resolve image. No global lock. Image auto-pull
//     self-locks via the per-name imagePullLock (imageOpsMu is now
//     publication-only).
//  2. Reserve a row: generate id, pick next IP, claim the name,
//     UpsertVM the "created" record. Held under createVMMu so two
//     concurrent `vm create --name foo` calls can't both think they
//     won.
//  3. Boot. Only the per-VM lock is held — parallel creates against
//     different VMs fully overlap.
func (s *VMService) CreateVM(ctx context.Context, params api.VMCreateParams) (vm model.VMRecord, err error) {
	op := s.beginOperation(ctx, "vm.create")
	defer func() {
		if err != nil {
			op.fail(err)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()

	if err := validateOptionalPositiveSetting("vcpu", params.VCPUCount); err != nil {
		return model.VMRecord{}, err
	}
	if err := validateOptionalPositiveSetting("memory", params.MemoryMiB); err != nil {
		return model.VMRecord{}, err
	}

	imageName := params.ImageName
	if imageName == "" {
		imageName = s.config.DefaultImageName
	}
	vmCreateStage(ctx, "resolve_image", "resolving image")
	image, err := s.findOrAutoPullImage(ctx, imageName)
	if err != nil {
		return model.VMRecord{}, err
	}
	vmCreateStage(ctx, "resolve_image", "using image "+image.Name)
	op.stage("image_resolved", imageLogAttrs(image)...)

	systemOverlaySize := int64(model.DefaultSystemOverlaySize)
	if params.SystemOverlaySize != "" {
		systemOverlaySize, err = model.ParseSize(params.SystemOverlaySize)
		if err != nil {
			return model.VMRecord{}, err
		}
	}
	workDiskSize := int64(model.DefaultWorkDiskSize)
	if params.WorkDiskSize != "" {
		workDiskSize, err = model.ParseSize(params.WorkDiskSize)
		if err != nil {
			return model.VMRecord{}, err
		}
	}
	spec := model.VMSpec{
		VCPUCount:             optionalIntOrDefault(params.VCPUCount, model.DefaultVCPUCount),
		MemoryMiB:             optionalIntOrDefault(params.MemoryMiB, model.DefaultMemoryMiB),
		SystemOverlaySizeByte: systemOverlaySize,
		WorkDiskSizeBytes:     workDiskSize,
		NATEnabled:            params.NATEnabled,
	}

	vm, err = s.reserveVM(ctx, strings.TrimSpace(params.Name), image, spec)
	if err != nil {
		return model.VMRecord{}, err
	}
	op.stage("persisted", vmLogAttrs(vm)...)
	vmCreateBindVM(ctx, vm)
	vmCreateStage(ctx, "reserve_vm", fmt.Sprintf("allocated %s (%s)", vm.Name, vm.Runtime.GuestIP))

	unlockVM := s.lockVMID(vm.ID)
	defer unlockVM()

	if params.NoStart {
		vm.State = model.VMStateStopped
		vm.Runtime.State = model.VMStateStopped
		if err := s.store.UpsertVM(ctx, vm); err != nil {
			return model.VMRecord{}, err
		}
		return vm, nil
	}
	return s.startVMLocked(ctx, vm, image)
}

// reserveVM holds createVMMu only long enough to verify the name is
// free, allocate a guest IP from the store, and persist the "created"
// reservation row. Everything else (image resolution upstream, boot
// downstream) runs outside this lock.
func (s *VMService) reserveVM(ctx context.Context, requestedName string, image model.Image, spec model.VMSpec) (model.VMRecord, error) {
	s.createVMMu.Lock()
	defer s.createVMMu.Unlock()

	name := requestedName
	if name == "" {
		generated, err := s.generateName(ctx)
		if err != nil {
			return model.VMRecord{}, err
		}
		name = generated
	}
	// Defense in depth: CLI has already validated the flag, but any
	// other RPC caller (SDK, direct JSON over the socket) lands here
	// without going through the CLI flag parser. The name flows into
	// /etc/hostname, kernel boot args, DNS records, and file paths —
	// it has to be DNS-label-safe.
	if err := model.ValidateVMName(name); err != nil {
		return model.VMRecord{}, err
	}
	// Exact-name lookup. Using FindVM here would also match a new name
	// that merely prefixes some existing VM's id or another VM's name,
	// falsely rejecting perfectly valid names.
	if _, err := s.store.GetVMByName(ctx, name); err == nil {
		return model.VMRecord{}, fmt.Errorf("vm name already exists: %s", name)
	} else if !errors.Is(err, sql.ErrNoRows) {
		return model.VMRecord{}, err
	}

	id, err := model.NewID()
	if err != nil {
		return model.VMRecord{}, err
	}
	guestIP, err := s.store.NextGuestIP(ctx, bridgePrefix(s.config.BridgeIP))
	if err != nil {
		return model.VMRecord{}, err
	}
	vmDir := filepath.Join(s.layout.VMsDir, id)
	if err := os.MkdirAll(vmDir, 0o755); err != nil {
		return model.VMRecord{}, err
	}
	vsockCID, err := defaultVSockCID(guestIP)
	if err != nil {
		return model.VMRecord{}, err
	}

	now := model.Now()
	vm := model.VMRecord{
		ID:            id,
		Name:          name,
		ImageID:       image.ID,
		State:         model.VMStateCreated,
		CreatedAt:     now,
		UpdatedAt:     now,
		LastTouchedAt: now,
		Spec:          spec,
		Runtime: model.VMRuntime{
			State:         model.VMStateCreated,
			GuestIP:       guestIP,
			DNSName:       vmdns.RecordName(name),
			VMDir:         vmDir,
			VSockPath:     defaultVSockPath(s.layout.RuntimeDir, id),
			VSockCID:      vsockCID,
			SystemOverlay: filepath.Join(vmDir, "system.cow"),
			WorkDiskPath:  filepath.Join(vmDir, "root.ext4"),
			LogPath:       filepath.Join(vmDir, "firecracker.log"),
			MetricsPath:   filepath.Join(vmDir, "metrics.json"),
		},
	}
	if err := s.store.UpsertVM(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	return vm, nil
}

// findOrAutoPullImage tries the local image store first; if the name
// isn't registered but matches an entry in the embedded imagecat
// catalog, it auto-pulls the bundle so `vm create --image foo` (and
// therefore `vm run`) works on a fresh host without the user having
// to run `image pull` first.
//
// Concurrency: parallel vm.create RPCs targeting the same missing
// image must not both run the full OCI fetch + ext4 build. The pull
// itself takes minutes, and the publishImage atom that closes it
// only protects the rename + upsert — by the time the second caller
// gets there, it has already done all the work, only to fail at the
// recheck with "image already exists". Hold a per-name pull lock
// around the recheck-and-pull section: the loser waits, sees the
// image already published on the post-lock recheck, and short-
// circuits with a FindImage. PullImage's own internal recheck stays
// in place as defense-in-depth for callers that bypass this path.
func (s *VMService) findOrAutoPullImage(ctx context.Context, idOrName string) (model.Image, error) {
	if image, err := s.img.FindImage(ctx, idOrName); err == nil {
		return image, nil
	}
	catalog, loadErr := imagecat.LoadEmbedded()
	if loadErr != nil {
		// Catalog unavailable: re-run the lookup to surface the
		// original not-found error.
		_, err := s.img.FindImage(ctx, idOrName)
		return model.Image{}, err
	}
	entry, lookupErr := catalog.Lookup(idOrName)
	if lookupErr != nil {
		// Not in the catalog either — surface the original not-found.
		_, err := s.img.FindImage(ctx, idOrName)
		return model.Image{}, err
	}

	lock := s.img.imagePullLock(entry.Name)
	lock.Lock()
	defer lock.Unlock()

	// Re-check under the lock: a peer may have finished the pull while
	// we waited.
	if image, err := s.img.FindImage(ctx, idOrName); err == nil {
		return image, nil
	}
	vmCreateStage(ctx, "auto_pull_image", fmt.Sprintf("pulling %s from image catalog", entry.Name))
	if _, pullErr := s.img.PullImage(ctx, api.ImagePullParams{Ref: entry.Name}); pullErr != nil {
		return model.Image{}, fmt.Errorf("auto-pull image %q: %w", entry.Name, pullErr)
	}
	return s.img.FindImage(ctx, idOrName)
}