package daemon import ( "context" "database/sql" "errors" "fmt" "os" "path/filepath" "strings" "banger/internal/api" "banger/internal/imagecat" "banger/internal/model" "banger/internal/vmdns" ) // CreateVM is split into three phases so the global createVMMu guards // only the narrow name+IP reservation window, not the slow image // resolution or the multi-second boot flow: // // 1. Validate + resolve image. No global lock. Image auto-pull // self-locks via imageOpsMu (which is also now publication-only). // 2. Reserve a row: generate id, pick next IP, claim the name, // UpsertVM the "created" record. Held under createVMMu so two // concurrent `vm create --name foo` calls can't both think they // won. // 3. Boot. Only the per-VM lock is held — parallel creates against // different VMs fully overlap. func (s *VMService) CreateVM(ctx context.Context, params api.VMCreateParams) (vm model.VMRecord, err error) { op := s.beginOperation(ctx, "vm.create") defer func() { if err != nil { op.fail(err) return } op.done(vmLogAttrs(vm)...) }() if err := validateOptionalPositiveSetting("vcpu", params.VCPUCount); err != nil { return model.VMRecord{}, err } if err := validateOptionalPositiveSetting("memory", params.MemoryMiB); err != nil { return model.VMRecord{}, err } imageName := params.ImageName if imageName == "" { imageName = s.config.DefaultImageName } vmCreateStage(ctx, "resolve_image", "resolving image") image, err := s.findOrAutoPullImage(ctx, imageName) if err != nil { return model.VMRecord{}, err } vmCreateStage(ctx, "resolve_image", "using image "+image.Name) op.stage("image_resolved", imageLogAttrs(image)...) systemOverlaySize := int64(model.DefaultSystemOverlaySize) if params.SystemOverlaySize != "" { systemOverlaySize, err = model.ParseSize(params.SystemOverlaySize) if err != nil { return model.VMRecord{}, err } } workDiskSize := int64(model.DefaultWorkDiskSize) if params.WorkDiskSize != "" { workDiskSize, err = model.ParseSize(params.WorkDiskSize) if err != nil { return model.VMRecord{}, err } } spec := model.VMSpec{ VCPUCount: optionalIntOrDefault(params.VCPUCount, model.DefaultVCPUCount), MemoryMiB: optionalIntOrDefault(params.MemoryMiB, model.DefaultMemoryMiB), SystemOverlaySizeByte: systemOverlaySize, WorkDiskSizeBytes: workDiskSize, NATEnabled: params.NATEnabled, } vm, err = s.reserveVM(ctx, strings.TrimSpace(params.Name), image, spec) if err != nil { return model.VMRecord{}, err } op.stage("persisted", vmLogAttrs(vm)...) vmCreateBindVM(ctx, vm) vmCreateStage(ctx, "reserve_vm", fmt.Sprintf("allocated %s (%s)", vm.Name, vm.Runtime.GuestIP)) unlockVM := s.lockVMID(vm.ID) defer unlockVM() if params.NoStart { vm.State = model.VMStateStopped vm.Runtime.State = model.VMStateStopped if err := s.store.UpsertVM(ctx, vm); err != nil { return model.VMRecord{}, err } return vm, nil } return s.startVMLocked(ctx, vm, image) } // reserveVM holds createVMMu only long enough to verify the name is // free, allocate a guest IP from the store, and persist the "created" // reservation row. Everything else (image resolution upstream, boot // downstream) runs outside this lock. func (s *VMService) reserveVM(ctx context.Context, requestedName string, image model.Image, spec model.VMSpec) (model.VMRecord, error) { s.createVMMu.Lock() defer s.createVMMu.Unlock() name := requestedName if name == "" { generated, err := s.generateName(ctx) if err != nil { return model.VMRecord{}, err } name = generated } // Defense in depth: CLI has already validated the flag, but any // other RPC caller (SDK, direct JSON over the socket) lands here // without going through the CLI flag parser. The name flows into // /etc/hostname, kernel boot args, DNS records, and file paths — // it has to be DNS-label-safe. if err := model.ValidateVMName(name); err != nil { return model.VMRecord{}, err } // Exact-name lookup. Using FindVM here would also match a new name // that merely prefixes some existing VM's id or another VM's name, // falsely rejecting perfectly valid names. if _, err := s.store.GetVMByName(ctx, name); err == nil { return model.VMRecord{}, fmt.Errorf("vm name already exists: %s", name) } else if !errors.Is(err, sql.ErrNoRows) { return model.VMRecord{}, err } id, err := model.NewID() if err != nil { return model.VMRecord{}, err } guestIP, err := s.store.NextGuestIP(ctx, bridgePrefix(s.config.BridgeIP)) if err != nil { return model.VMRecord{}, err } vmDir := filepath.Join(s.layout.VMsDir, id) if err := os.MkdirAll(vmDir, 0o755); err != nil { return model.VMRecord{}, err } vsockCID, err := defaultVSockCID(guestIP) if err != nil { return model.VMRecord{}, err } now := model.Now() vm := model.VMRecord{ ID: id, Name: name, ImageID: image.ID, State: model.VMStateCreated, CreatedAt: now, UpdatedAt: now, LastTouchedAt: now, Spec: spec, Runtime: model.VMRuntime{ State: model.VMStateCreated, GuestIP: guestIP, DNSName: vmdns.RecordName(name), VMDir: vmDir, VSockPath: defaultVSockPath(s.layout.RuntimeDir, id), VSockCID: vsockCID, SystemOverlay: filepath.Join(vmDir, "system.cow"), WorkDiskPath: filepath.Join(vmDir, "root.ext4"), LogPath: filepath.Join(vmDir, "firecracker.log"), MetricsPath: filepath.Join(vmDir, "metrics.json"), }, } if err := s.store.UpsertVM(ctx, vm); err != nil { return model.VMRecord{}, err } return vm, nil } // findOrAutoPullImage tries the local image store first; if the name // isn't registered but matches an entry in the embedded imagecat // catalog, it auto-pulls the bundle so `vm create --image foo` (and // therefore `vm run`) works on a fresh host without the user having // to run `image pull` first. // // Concurrency: parallel vm.create RPCs targeting the same missing // image must not both run the full OCI fetch + ext4 build. The pull // itself takes minutes, and the publishImage atom that closes it // only protects the rename + upsert — by the time the second caller // gets there, it has already done all the work, only to fail at the // recheck with "image already exists". Hold a per-name pull lock // around the recheck-and-pull section: the loser waits, sees the // image already published on the post-lock recheck, and short- // circuits with a FindImage. PullImage's own internal recheck stays // in place as defense-in-depth for callers that bypass this path. func (s *VMService) findOrAutoPullImage(ctx context.Context, idOrName string) (model.Image, error) { if image, err := s.img.FindImage(ctx, idOrName); err == nil { return image, nil } catalog, loadErr := imagecat.LoadEmbedded() if loadErr != nil { _, err := s.img.FindImage(ctx, idOrName) return model.Image{}, err } entry, lookupErr := catalog.Lookup(idOrName) if lookupErr != nil { // Not in the catalog either — surface the original not-found. _, err := s.img.FindImage(ctx, idOrName) return model.Image{}, err } release, err := s.img.acquireImagePullLock(ctx, entry.Name) if err != nil { return model.Image{}, err } defer release() if image, err := s.img.FindImage(ctx, idOrName); err == nil { return image, nil } vmCreateStage(ctx, "auto_pull_image", fmt.Sprintf("pulling %s from image catalog", entry.Name)) if _, pullErr := s.img.PullImage(ctx, api.ImagePullParams{Ref: entry.Name}); pullErr != nil { return model.Image{}, fmt.Errorf("auto-pull image %q: %w", entry.Name, pullErr) } return s.img.FindImage(ctx, idOrName) }