workspace prepare: release VM mutex before guest I/O

Previously withVMLockByRef held the per-VM mutex across InspectRepo,
waitForGuestSSH, dialGuest, ImportRepoToGuest (the tar stream!), and
the readonly chmod. A large repo could block `vm stop` / `vm delete`
/ `vm restart` on the same VM for however long the import took.

Split into two phases:

  1. VM mutex held briefly to validate state (running + PID alive)
     and snapshot the fields needed for SSH (guest IP, api sock).
  2. VM mutex released. Acquire workspaceLocks[id] — a separate
     per-VM mutex scoped to workspace.prepare / workspace.export —
     for the guest I/O phase.

Lifecycle ops (stop/delete/restart/set) only take vmLocks, so they
no longer queue behind a slow import. Two concurrent prepares on the
same VM still serialise via workspaceLocks so tar streams don't
interleave. ExportVMWorkspace also acquires workspaceLocks to avoid
snapshotting a half-streamed import.
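
The workspaceLocks helper itself lives in one of the other changed
files and is not shown in the hunks below. As a rough sketch only
(type name, fields, and package clause are guesses, not the actual
code), all it needs to be is a lazily-built map of per-VM mutexes
whose lock method returns an unlock func:

// Hypothetical sketch of a keyed lock like workspaceLocks; the real
// type in this commit may be shaped differently.
package daemon // package name assumed

import "sync"

type keyedLocks struct {
	mu    sync.Mutex
	locks map[string]*sync.Mutex
}

// lock returns the mutex for the given VM ID, creating it on first
// use, and hands back an unlock func so callers can simply defer it.
func (k *keyedLocks) lock(id string) (unlock func()) {
	k.mu.Lock()
	if k.locks == nil {
		k.locks = make(map[string]*sync.Mutex)
	}
	l, ok := k.locks[id]
	if !ok {
		l = &sync.Mutex{}
		k.locks[id] = l
	}
	k.mu.Unlock()

	l.Lock() // held for the whole guest-I/O phase on this VM
	return l.Unlock
}

With that shape, ExportVMWorkspace and PrepareVMWorkspace both just
defer the returned unlock, as the hunks below show.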

Two regression tests (sequential — they swap package-level seams):

  ReleasesVMLockDuringGuestIO: stall the import fake, assert the VM
  mutex is acquirable from another goroutine during the stall (the
  core pattern is sketched below).

  SerialisesConcurrentPreparesOnSameVM: 3 concurrent prepares; assert
  the import is only ever invoked one at a time per VM.
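
The tests themselves are in a file that is not part of the hunks
shown here. Distilled to a standalone illustration of the first
test's stall-and-probe idea (every name below is made up; the real
test drives the daemon through the package-level seams):

// Self-contained illustration only, not the real test: stall a
// swappable import seam, then prove the VM mutex can still be taken
// while the import is blocked.
package lockdemo

import (
	"sync"
	"testing"
)

// importFunc stands in for a seam like workspaceImportFunc.
var importFunc = func() {}

// prepare mirrors the two-phase shape: hold vmMu only briefly, then
// run the import under ioMu (the per-VM workspace lock).
func prepare(vmMu, ioMu *sync.Mutex) {
	vmMu.Lock() // phase 1: validate state / snapshot fields
	vmMu.Unlock()

	ioMu.Lock() // phase 2: guest I/O, VM mutex already released
	defer ioMu.Unlock()
	importFunc()
}

func TestVMMutexFreeDuringImport(t *testing.T) {
	var vmMu, ioMu sync.Mutex
	entered := make(chan struct{})
	stall := make(chan struct{})
	importFunc = func() { close(entered); <-stall }
	defer func() { importFunc = func() {} }()

	done := make(chan struct{})
	go func() { prepare(&vmMu, &ioMu); close(done) }()

	<-entered // the fake import is now stalled mid guest I/O
	if !vmMu.TryLock() {
		t.Fatal("VM mutex still held during guest I/O")
	}
	vmMu.Unlock()

	close(stall) // let the stalled import finish
	<-done
}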

ARCHITECTURE.md documents the split + updated lock ordering.

@@ -16,6 +16,14 @@ import (
 	"banger/internal/system"
 )
+// Test seams. Tests swap these to observe or stall the guest-I/O
+// phase without needing a real git repo or SSH server. Production
+// callers see the real implementations from the workspace package.
+var (
+	workspaceInspectRepoFunc = ws.InspectRepo
+	workspaceImportFunc = ws.ImportRepoToGuest
+)
 func (d *Daemon) ExportVMWorkspace(ctx context.Context, params api.WorkspaceExportParams) (api.WorkspaceExportResult, error) {
 	guestPath := strings.TrimSpace(params.GuestPath)
 	if guestPath == "" {
@@ -28,6 +36,12 @@ func (d *Daemon) ExportVMWorkspace(ctx context.Context, params api.WorkspaceExpo
 	if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
 		return api.WorkspaceExportResult{}, fmt.Errorf("vm %q is not running", vm.Name)
 	}
+	// Serialise with any in-flight workspace.prepare on the same VM so
+	// we never snapshot a half-streamed tar. Does not block vm stop /
+	// delete / restart — those only take the VM mutex.
+	unlock := d.workspaceLocks.lock(vm.ID)
+	defer unlock()
 	client, err := d.dialGuest(ctx, net.JoinHostPort(vm.Runtime.GuestIP, "22"))
 	if err != nil {
 		return api.WorkspaceExportResult{}, fmt.Errorf("dial guest: %w", err)
@@ -113,23 +127,37 @@ func (d *Daemon) PrepareVMWorkspace(ctx context.Context, params api.VMWorkspaceP
 	if branchName == "" && strings.TrimSpace(params.From) != "" {
 		return model.WorkspacePrepareResult{}, errors.New("workspace from requires branch")
 	}
-	var prepared model.WorkspacePrepareResult
-	_, err = d.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
+	// Phase 1: acquire the VM mutex ONLY long enough to verify state
+	// and snapshot the fields we need (IP, PID, api sock). Release it
+	// before any SSH or tar I/O so this slow operation cannot block
+	// vm stop / vm delete / vm restart on the same VM.
+	vm, err := d.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
 		if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
 			return model.VMRecord{}, fmt.Errorf("vm %q is not running", vm.Name)
 		}
-		result, err := d.prepareVMWorkspaceLocked(ctx, vm, strings.TrimSpace(params.SourcePath), guestPath, branchName, fromRef, mode, params.ReadOnly)
-		if err != nil {
-			return model.VMRecord{}, err
-		}
-		prepared = result
 		return vm, nil
 	})
-	return prepared, err
+	if err != nil {
+		return model.WorkspacePrepareResult{}, err
+	}
+	// Phase 2: serialise concurrent workspace operations on THIS vm
+	// (so two prepares don't interleave tar streams), but do not
+	// block lifecycle ops. If the VM gets stopped or deleted mid-
+	// flight, the SSH dial or stream will fail naturally; ctx
+	// cancellation propagates through.
+	unlock := d.workspaceLocks.lock(vm.ID)
+	defer unlock()
+	return d.prepareVMWorkspaceGuestIO(ctx, vm, strings.TrimSpace(params.SourcePath), guestPath, branchName, fromRef, mode, params.ReadOnly)
 }
-func (d *Daemon) prepareVMWorkspaceLocked(ctx context.Context, vm model.VMRecord, sourcePath, guestPath, branchName, fromRef string, mode model.WorkspacePrepareMode, readOnly bool) (model.WorkspacePrepareResult, error) {
-	spec, err := ws.InspectRepo(ctx, sourcePath, branchName, fromRef)
+// prepareVMWorkspaceGuestIO performs the actual guest-side work:
+// inspect the local repo, dial SSH, stream the tar, optionally chmod
+// readonly. It is called without holding the VM mutex.
+func (d *Daemon) prepareVMWorkspaceGuestIO(ctx context.Context, vm model.VMRecord, sourcePath, guestPath, branchName, fromRef string, mode model.WorkspacePrepareMode, readOnly bool) (model.WorkspacePrepareResult, error) {
+	spec, err := workspaceInspectRepoFunc(ctx, sourcePath, branchName, fromRef)
 	if err != nil {
 		return model.WorkspacePrepareResult{}, err
 	}
@@ -145,7 +173,7 @@ func (d *Daemon) prepareVMWorkspaceLocked(ctx context.Context, vm model.VMRecord
 		return model.WorkspacePrepareResult{}, fmt.Errorf("dial guest ssh: %w", err)
 	}
 	defer client.Close()
-	if err := ws.ImportRepoToGuest(ctx, client, spec, guestPath, mode); err != nil {
+	if err := workspaceImportFunc(ctx, client, spec, guestPath, mode); err != nil {
 		return model.WorkspacePrepareResult{}, err
 	}
 	if readOnly {