banger/internal/daemon/images.go
Thales Maciel 72882e45d7
daemon: serialise concurrent image/kernel pulls + atomic-rename seed refresh
Three concurrency bugs surfaced by `make smoke JOBS=4` that all stem
from `vm.create` paths assuming single-caller semantics:

1. **Kernel auto-pull manifest race.** Parallel `vm.create` calls that
   each need to auto-pull the same kernel ref both run kernelcat.Fetch
   in parallel against the same /var/lib/banger/kernels/<name>/. Fetch
   writes manifest.json non-atomically (truncate + write); the peer
   reads it back mid-write and trips
   "parse manifest for X: unexpected end of JSON input".

   Fix: per-name `sync.Mutex` map on `ImageService` (kernelPullLock).
   `KernelPull` and `readOrAutoPullKernel` both acquire it and re-check
   `kernelcat.ReadLocal` after the lock so a peer who finished while we
   waited is treated as success — `readOrAutoPullKernel` does NOT call
   `s.KernelPull` because that path errors with "already pulled" on a
   peer-success, which would be wrong for auto-pull. Different kernels
   stay parallel.

2. **Image auto-pull race.** Same shape as the kernel race but on the
   image side: parallel `vm.create` calls both run pullFromBundle /
   pullFromOCI for the missing image (each ~minutes of OCI fetch +
   ext4 build). The publishImage atom under imageOpsMu only protects
   the rename + UpsertImage commit, so the loser does all the work
   only to fail at the recheck with "image already exists".

   Fix: per-name `sync.Mutex` map on `ImageService` (imagePullLock).
   `findOrAutoPullImage` acquires it, re-checks FindImage, and only
   then calls PullImage. Loser short-circuits with the
   freshly-published image instead of redoing minutes of work.
   PullImage's own publishImage recheck stays as defense-in-depth
   for callers that bypass the auto-pull path.

3. **Work-seed refresh race.** When the host's SSH key has rotated
   since an image was last refreshed, `ensureAuthorizedKeyOnWorkDisk`
   triggers `refreshManagedWorkSeedFingerprint`, which rewrote the
   shared work-seed.ext4 in place via e2rm + e2cp. Peer `vm.create`
   calls doing parallel `MaterializeWorkDisk` rdumps observed a torn
   ext4 image — "Superblock checksum does not match superblock".

   Fix: stage the rewrite on a sibling tmpfile (`<seed>.refresh.<pid>-<ns>.tmp`)
   and atomic-rename. Concurrent readers either have the file open
   (kernel keeps the pre-rename inode alive) or open after the rename
   (see the new inode) — never observe a partial state. Two parallel
   refreshes are idempotent (same daemon, same SSH key) so unique tmp
   names are enough; whichever rename lands last wins, with identical
   content. UpsertImage runs after the rename so the recorded
   fingerprint always matches what's on disk.

Plus one smoke harness fix: reclassify `vm_prune` from `pure` to
`global`. `vm prune -f` removes ALL stopped VMs system-wide, not just
the ones the scenario created — so a parallel peer scenario that
happens to have its VM in `created`/`stopped` momentarily gets wiped.
Moving prune to the post-pool serial phase keeps it from racing with
in-flight scenarios.

After all four fixes, `make smoke JOBS=4` passes 21/21 in 174s
(serial baseline 141s; the small overhead is the buffered-output and
`wait -n` semaphore cost — well worth the parallelism for fast-iter
work on a 32-core box).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 17:24:11 -03:00

315 lines
10 KiB
Go

package daemon
import (
"context"
"database/sql"
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"banger/internal/api"
"banger/internal/daemon/imagemgr"
"banger/internal/kernelcat"
"banger/internal/model"
"banger/internal/system"
)
// RegisterImage creates or updates an unmanaged image row. Path
// validation + kernel resolution run without imageOpsMu — only the
// lookup-then-upsert atom is held under the lock so concurrent
// registers of the same name don't race.
func (s *ImageService) RegisterImage(ctx context.Context, params api.ImageRegisterParams) (image model.Image, err error) {
	name := strings.TrimSpace(params.Name)
	if name == "" {
		return model.Image{}, fmt.Errorf("image name is required")
	}
	rootfsPath := strings.TrimSpace(params.RootfsPath)
	if rootfsPath == "" {
		return model.Image{}, fmt.Errorf("rootfs path is required")
	}
	// Default the work-seed path from the rootfs sibling convention, but
	// only when that candidate actually exists on disk.
	workSeedPath := strings.TrimSpace(params.WorkSeedPath)
	if workSeedPath == "" {
		if candidate := system.WorkSeedPath(rootfsPath); candidate != "" {
			if _, statErr := os.Stat(candidate); statErr == nil {
				workSeedPath = candidate
			}
		}
	}
	kernelPath, initrdPath, modulesDir, err := s.resolveKernelInputs(ctx, params.KernelRef, params.KernelPath, params.InitrdPath, params.ModulesDir)
	if err != nil {
		return model.Image{}, err
	}
	if err := imagemgr.ValidateRegisterPaths(rootfsPath, workSeedPath, kernelPath, initrdPath, modulesDir); err != nil {
		return model.Image{}, err
	}
	// Lookup-then-upsert is the only atom that needs the lock.
	s.imageOpsMu.Lock()
	defer s.imageOpsMu.Unlock()
	ts := model.Now()
	prior, getErr := s.store.GetImageByName(ctx, name)
	if getErr == nil {
		if prior.Managed {
			return model.Image{}, fmt.Errorf("managed image %s cannot be updated via register", name)
		}
		// Existing unmanaged row: refresh its path fields in place.
		image = prior
		image.RootfsPath = rootfsPath
		image.WorkSeedPath = workSeedPath
		image.KernelPath = kernelPath
		image.InitrdPath = initrdPath
		image.ModulesDir = modulesDir
		image.UpdatedAt = ts
	} else if errors.Is(getErr, sql.ErrNoRows) {
		// No row yet: mint a fresh unmanaged image record.
		id, idErr := model.NewID()
		if idErr != nil {
			return model.Image{}, idErr
		}
		image = model.Image{
			ID:           id,
			Name:         name,
			Managed:      false,
			RootfsPath:   rootfsPath,
			WorkSeedPath: workSeedPath,
			KernelPath:   kernelPath,
			InitrdPath:   initrdPath,
			ModulesDir:   modulesDir,
			CreatedAt:    ts,
			UpdatedAt:    ts,
		}
	} else {
		return model.Image{}, getErr
	}
	if err := s.store.UpsertImage(ctx, image); err != nil {
		return model.Image{}, err
	}
	return image, nil
}
// PromoteImage copies an unmanaged image's files into the managed
// artifacts dir and flips its managed bit. The expensive file copy,
// SSH-key seeding, and boot-artifact staging all happen outside
// imageOpsMu — only the find/rename/upsert commit atom holds the
// lock.
//
// Returns an error if the image is already managed, if the target
// artifact dir already exists, or if any copy/stage step fails; on the
// upsert failure path the just-renamed artifact dir is rolled back.
func (s *ImageService) PromoteImage(ctx context.Context, idOrName string) (image model.Image, err error) {
op := s.beginOperation(ctx, "image.promote")
// Record the operation outcome with whatever image state we reached.
defer func() {
if err != nil {
op.fail(err, imageLogAttrs(image)...)
return
}
op.done(imageLogAttrs(image)...)
}()
image, err = s.FindImage(ctx, idOrName)
if err != nil {
return model.Image{}, err
}
if image.Managed {
return model.Image{}, fmt.Errorf("image %s is already managed", image.Name)
}
if err := imagemgr.ValidatePromotePaths(image.RootfsPath, image.KernelPath, image.InitrdPath, image.ModulesDir); err != nil {
return model.Image{}, err
}
if strings.TrimSpace(s.layout.ImagesDir) == "" {
return model.Image{}, errors.New("images dir is not configured")
}
if err := os.MkdirAll(s.layout.ImagesDir, 0o755); err != nil {
return model.Image{}, err
}
// The final artifact home; refuse to clobber a pre-existing dir.
artifactDir := filepath.Join(s.layout.ImagesDir, image.ID)
if _, statErr := os.Stat(artifactDir); statErr == nil {
return model.Image{}, fmt.Errorf("artifact dir already exists: %s", artifactDir)
} else if !os.IsNotExist(statErr) {
return model.Image{}, statErr
}
// Stage into a temp sibling dir so the final activation is a single
// rename; cleanupStage guards removal until the rename succeeds.
stageDir, err := os.MkdirTemp(s.layout.ImagesDir, image.ID+".promote-")
if err != nil {
return model.Image{}, err
}
cleanupStage := true
defer func() {
if cleanupStage {
_ = os.RemoveAll(stageDir)
}
}()
rootfsPath := filepath.Join(stageDir, "rootfs.ext4")
op.stage("copy_rootfs", "source_rootfs_path", image.RootfsPath, "target_rootfs_path", rootfsPath)
if err := system.CopyFilePreferClone(image.RootfsPath, rootfsPath); err != nil {
return model.Image{}, err
}
// A recorded work-seed that has since vanished on disk is tolerated:
// drop it from the image rather than failing the promotion.
workSeedPath := ""
if image.WorkSeedPath != "" {
if _, statErr := os.Stat(image.WorkSeedPath); statErr != nil {
if os.IsNotExist(statErr) {
op.stage("skip_missing_work_seed", "source_work_seed_path", image.WorkSeedPath)
image.WorkSeedPath = ""
} else {
return model.Image{}, statErr
}
}
}
if image.WorkSeedPath != "" {
workSeedPath = filepath.Join(stageDir, "work-seed.ext4")
op.stage("copy_work_seed", "source_work_seed_path", image.WorkSeedPath, "target_work_seed_path", workSeedPath)
if err := system.CopyFilePreferClone(image.WorkSeedPath, workSeedPath); err != nil {
return model.Image{}, err
}
// Seed the daemon's SSH key into the staged copy, not the source.
image.SeededSSHPublicKeyFingerprint, err = s.seedAuthorizedKeyOnExt4Image(ctx, workSeedPath)
if err != nil {
return model.Image{}, err
}
} else {
image.SeededSSHPublicKeyFingerprint = ""
}
_, initrdPath, modulesDir, err := imagemgr.StageBootArtifacts(ctx, s.runner, stageDir, image.KernelPath, image.InitrdPath, image.ModulesDir)
if err != nil {
return model.Image{}, err
}
// Rewrite all paths to their post-rename locations before committing.
image.Managed = true
image.ArtifactDir = artifactDir
image.RootfsPath = filepath.Join(artifactDir, "rootfs.ext4")
if workSeedPath != "" {
image.WorkSeedPath = filepath.Join(artifactDir, "work-seed.ext4")
}
image.KernelPath = filepath.Join(artifactDir, "kernel")
image.InitrdPath = imagemgr.StageOptionalArtifactPath(artifactDir, initrdPath, "initrd.img")
image.ModulesDir = imagemgr.StageOptionalArtifactPath(artifactDir, modulesDir, "modules")
image.UpdatedAt = model.Now()
op.stage("activate_artifacts", "artifact_dir", artifactDir)
// Commit atom: rename + upsert under the lock so no concurrent image
// op observes a renamed dir without its matching store row.
s.imageOpsMu.Lock()
defer s.imageOpsMu.Unlock()
if err := os.Rename(stageDir, artifactDir); err != nil {
return model.Image{}, err
}
cleanupStage = false
if err := s.store.UpsertImage(ctx, image); err != nil {
// Roll back the activated dir so disk state matches the store.
_ = os.RemoveAll(artifactDir)
return model.Image{}, err
}
return image, nil
}
// DeleteImage runs the lookup + reference check + store delete under
// imageOpsMu so a concurrent CreateVM can't slip an image_id reference
// in between the check and the delete. File cleanup happens after the
// lock is released — the store row is the authoritative handle.
func (s *ImageService) DeleteImage(ctx context.Context, idOrName string) (model.Image, error) {
	image, err := s.deleteImageRow(ctx, idOrName)
	if err != nil {
		return model.Image{}, err
	}
	// Managed artifacts are daemon-owned; reclaim them only once the
	// authoritative store row is gone and the lock has been dropped.
	if image.Managed && image.ArtifactDir != "" {
		if err := os.RemoveAll(image.ArtifactDir); err != nil {
			return model.Image{}, err
		}
	}
	return image, nil
}

// deleteImageRow performs the locked check-and-delete atom: resolve the
// image, refuse while any VM still references it, then drop the row.
func (s *ImageService) deleteImageRow(ctx context.Context, idOrName string) (model.Image, error) {
	s.imageOpsMu.Lock()
	defer s.imageOpsMu.Unlock()
	img, err := s.FindImage(ctx, idOrName)
	if err != nil {
		return model.Image{}, err
	}
	vms, err := s.store.FindVMsUsingImage(ctx, img.ID)
	if err != nil {
		return model.Image{}, err
	}
	if len(vms) > 0 {
		return model.Image{}, fmt.Errorf("image %s is still referenced by %d VM(s)", img.Name, len(vms))
	}
	if err := s.store.DeleteImage(ctx, img.ID); err != nil {
		return model.Image{}, err
	}
	return img, nil
}
// firstNonEmpty returns the first value whose trimmed form is non-empty,
// preserving the original (untrimmed) text. Returns "" when every value
// is blank or no values are given.
func firstNonEmpty(values ...string) string {
	for _, v := range values {
		if strings.TrimSpace(v) == "" {
			continue
		}
		return v
	}
	return ""
}
// resolveKernelInputs canonicalises user-supplied kernel info: either direct
// paths or a kernel-catalog ref. Shared by RegisterImage and PullImage.
// When kernelRef is given but not yet pulled locally, an auto-pull from the
// embedded kernelcat catalog fires so the caller doesn't have to manage
// kernel/image ordering by hand.
func (s *ImageService) resolveKernelInputs(ctx context.Context, kernelRef, kernelPath, initrdPath, modulesDir string) (string, string, string, error) {
	ref := strings.TrimSpace(kernelRef)
	kernel := strings.TrimSpace(kernelPath)
	initrd := strings.TrimSpace(initrdPath)
	modules := strings.TrimSpace(modulesDir)
	// Direct-path mode: no ref given, so a kernel path is mandatory.
	if ref == "" {
		if kernel == "" {
			return "", "", "", fmt.Errorf("kernel path is required (pass --kernel <path> or --kernel-ref <name>)")
		}
		return kernel, initrd, modules, nil
	}
	// Ref mode: direct path flags would conflict with the catalog entry.
	if kernel != "" || initrd != "" || modules != "" {
		return "", "", "", fmt.Errorf("--kernel-ref is mutually exclusive with --kernel/--initrd/--modules")
	}
	entry, err := s.readOrAutoPullKernel(ctx, ref)
	if err != nil {
		return "", "", "", err
	}
	return entry.KernelPath, entry.InitrdPath, entry.ModulesDir, nil
}
// readOrAutoPullKernel resolves kernelRef against the local kernelcat,
// auto-pulling the bundle from the embedded catalog on a miss.
//
// Concurrency-safe: it serialises with KernelPull via the same per-name
// pull lock and re-reads the local catalog once the lock is held. A peer
// that completed the pull while we waited satisfies the re-read — we
// deliberately do NOT call s.KernelPull here, because that path errors
// with "already pulled" on a successful peer-pull. Auto-pull's contract
// is "make sure this kernel is local", so a peer's success is our
// success, not a failure.
func (s *ImageService) readOrAutoPullKernel(ctx context.Context, kernelRef string) (kernelcat.Entry, error) {
	// Fast path: already pulled locally.
	local, readErr := kernelcat.ReadLocal(s.layout.KernelsDir, kernelRef)
	if readErr == nil {
		return local, nil
	}
	if !os.IsNotExist(readErr) {
		return kernelcat.Entry{}, fmt.Errorf("resolve kernel %q: %w", kernelRef, readErr)
	}
	// Local miss: confirm the ref exists in the embedded catalog before
	// taking the pull lock.
	catalog, loadErr := kernelcat.LoadEmbedded()
	if loadErr != nil {
		return kernelcat.Entry{}, fmt.Errorf("kernel %q not found locally: %w", kernelRef, loadErr)
	}
	catEntry, lookupErr := catalog.Lookup(kernelRef)
	if lookupErr != nil {
		return kernelcat.Entry{}, fmt.Errorf("kernel %q not found in catalog; run 'banger kernel list --available' to browse", kernelRef)
	}
	mu := s.kernelPullLock(kernelRef)
	mu.Lock()
	defer mu.Unlock()
	// Re-check under the lock: a peer may have finished the pull while
	// we were blocked, in which case its result is ours.
	if peer, err := kernelcat.ReadLocal(s.layout.KernelsDir, kernelRef); err == nil {
		return peer, nil
	}
	vmCreateStage(ctx, "auto_pull_kernel", fmt.Sprintf("pulling kernel %s from catalog", kernelRef))
	fetched, fetchErr := kernelcat.Fetch(ctx, nil, s.layout.KernelsDir, catEntry)
	if fetchErr != nil {
		return kernelcat.Entry{}, fmt.Errorf("auto-pull kernel %q: %w", kernelRef, fetchErr)
	}
	return fetched, nil
}