workspace prepare: release VM mutex before guest I/O

Previously withVMLockByRef held the per-VM mutex across InspectRepo,
waitForGuestSSH, dialGuest, ImportRepoToGuest (the tar stream!), and
the readonly chmod. A large repo could block `vm stop` / `vm delete`
/ `vm restart` on the same VM for however long the import took.

Split into two phases:

  1. VM mutex held briefly to validate state (running + PID alive)
     and snapshot the fields needed for SSH (guest IP, api sock).
  2. VM mutex released. Acquire workspaceLocks[id] — a separate
     per-VM mutex scoped to workspace.prepare / workspace.export —
     for the guest I/O phase.

Lifecycle ops (stop/delete/restart/set) only take vmLocks, so they
no longer queue behind a slow import. Two concurrent prepares on the
same VM still serialise via workspaceLocks so tar streams don't
interleave. ExportVMWorkspace also acquires workspaceLocks to avoid
snapshotting a half-streamed import.
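
The workspaceLocks helper itself lives in one of the other changed
files and is not shown in the hunks below. As a rough sketch only
(type name, fields, and package clause are guesses, not the actual
code), all it needs to be is a lazily-built map of per-VM mutexes
whose lock method returns an unlock func:

// Hypothetical sketch of a keyed lock like workspaceLocks; the real
// type in this commit may be shaped differently.
package daemon // package name assumed

import "sync"

type keyedLocks struct {
	mu    sync.Mutex
	locks map[string]*sync.Mutex
}

// lock returns the mutex for the given VM ID, creating it on first
// use, and hands back an unlock func so callers can simply defer it.
func (k *keyedLocks) lock(id string) (unlock func()) {
	k.mu.Lock()
	if k.locks == nil {
		k.locks = make(map[string]*sync.Mutex)
	}
	l, ok := k.locks[id]
	if !ok {
		l = &sync.Mutex{}
		k.locks[id] = l
	}
	k.mu.Unlock()

	l.Lock() // held for the whole guest-I/O phase on this VM
	return l.Unlock
}

With that shape, ExportVMWorkspace and PrepareVMWorkspace both just
defer the returned unlock, as the hunks below show.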

Two regression tests (sequential — they swap package-level seams):

  ReleasesVMLockDuringGuestIO: stall the import fake, assert the VM
  mutex is acquirable from another goroutine during the stall (the
  core pattern is sketched below).

  SerialisesConcurrentPreparesOnSameVM: 3 concurrent prepares; assert
  the import is only ever invoked one at a time per VM.
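
The tests themselves are in a file that is not part of the hunks
shown here. Distilled to a standalone illustration of the first
test's stall-and-probe idea (every name below is made up; the real
test drives the daemon through the package-level seams):

// Self-contained illustration only, not the real test: stall a
// swappable import seam, then prove the VM mutex can still be taken
// while the import is blocked.
package lockdemo

import (
	"sync"
	"testing"
)

// importFunc stands in for a seam like workspaceImportFunc.
var importFunc = func() {}

// prepare mirrors the two-phase shape: hold vmMu only briefly, then
// run the import under ioMu (the per-VM workspace lock).
func prepare(vmMu, ioMu *sync.Mutex) {
	vmMu.Lock() // phase 1: validate state / snapshot fields
	vmMu.Unlock()

	ioMu.Lock() // phase 2: guest I/O, VM mutex already released
	defer ioMu.Unlock()
	importFunc()
}

func TestVMMutexFreeDuringImport(t *testing.T) {
	var vmMu, ioMu sync.Mutex
	entered := make(chan struct{})
	stall := make(chan struct{})
	importFunc = func() { close(entered); <-stall }
	defer func() { importFunc = func() {} }()

	done := make(chan struct{})
	go func() { prepare(&vmMu, &ioMu); close(done) }()

	<-entered // the fake import is now stalled mid guest I/O
	if !vmMu.TryLock() {
		t.Fatal("VM mutex still held during guest I/O")
	}
	vmMu.Unlock()

	close(stall) // let the stalled import finish
	<-done
}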

ARCHITECTURE.md documents the split + updated lock ordering.

@@ -16,6 +16,14 @@ import (
 	"banger/internal/system"
 )
+// Test seams. Tests swap these to observe or stall the guest-I/O
+// phase without needing a real git repo or SSH server. Production
+// callers see the real implementations from the workspace package.
+var (
+	workspaceInspectRepoFunc = ws.InspectRepo
+	workspaceImportFunc = ws.ImportRepoToGuest
+)
 func (d *Daemon) ExportVMWorkspace(ctx context.Context, params api.WorkspaceExportParams) (api.WorkspaceExportResult, error) {
 	guestPath := strings.TrimSpace(params.GuestPath)
 	if guestPath == "" {
@@ -28,6 +36,12 @@ func (d *Daemon) ExportVMWorkspace(ctx context.Context, params api.WorkspaceExpo
 	if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
 		return api.WorkspaceExportResult{}, fmt.Errorf("vm %q is not running", vm.Name)
 	}
+	// Serialise with any in-flight workspace.prepare on the same VM so
+	// we never snapshot a half-streamed tar. Does not block vm stop /
+	// delete / restart — those only take the VM mutex.
+	unlock := d.workspaceLocks.lock(vm.ID)
+	defer unlock()
 	client, err := d.dialGuest(ctx, net.JoinHostPort(vm.Runtime.GuestIP, "22"))
 	if err != nil {
 		return api.WorkspaceExportResult{}, fmt.Errorf("dial guest: %w", err)
@@ -113,23 +127,37 @@ func (d *Daemon) PrepareVMWorkspace(ctx context.Context, params api.VMWorkspaceP
 	if branchName == "" && strings.TrimSpace(params.From) != "" {
 		return model.WorkspacePrepareResult{}, errors.New("workspace from requires branch")
 	}
-	var prepared model.WorkspacePrepareResult
-	_, err = d.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
+	// Phase 1: acquire the VM mutex ONLY long enough to verify state
+	// and snapshot the fields we need (IP, PID, api sock). Release it
+	// before any SSH or tar I/O so this slow operation cannot block
+	// vm stop / vm delete / vm restart on the same VM.
+	vm, err := d.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
 		if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
 			return model.VMRecord{}, fmt.Errorf("vm %q is not running", vm.Name)
 		}
-		result, err := d.prepareVMWorkspaceLocked(ctx, vm, strings.TrimSpace(params.SourcePath), guestPath, branchName, fromRef, mode, params.ReadOnly)
-		if err != nil {
-			return model.VMRecord{}, err
-		}
-		prepared = result
 		return vm, nil
 	})
-	return prepared, err
+	if err != nil {
+		return model.WorkspacePrepareResult{}, err
+	}
+	// Phase 2: serialise concurrent workspace operations on THIS vm
+	// (so two prepares don't interleave tar streams), but do not
+	// block lifecycle ops. If the VM gets stopped or deleted mid-
+	// flight, the SSH dial or stream will fail naturally; ctx
+	// cancellation propagates through.
+	unlock := d.workspaceLocks.lock(vm.ID)
+	defer unlock()
+	return d.prepareVMWorkspaceGuestIO(ctx, vm, strings.TrimSpace(params.SourcePath), guestPath, branchName, fromRef, mode, params.ReadOnly)
 }
-func (d *Daemon) prepareVMWorkspaceLocked(ctx context.Context, vm model.VMRecord, sourcePath, guestPath, branchName, fromRef string, mode model.WorkspacePrepareMode, readOnly bool) (model.WorkspacePrepareResult, error) {
-	spec, err := ws.InspectRepo(ctx, sourcePath, branchName, fromRef)
+// prepareVMWorkspaceGuestIO performs the actual guest-side work:
+// inspect the local repo, dial SSH, stream the tar, optionally chmod
+// readonly. It is called without holding the VM mutex.
+func (d *Daemon) prepareVMWorkspaceGuestIO(ctx context.Context, vm model.VMRecord, sourcePath, guestPath, branchName, fromRef string, mode model.WorkspacePrepareMode, readOnly bool) (model.WorkspacePrepareResult, error) {
+	spec, err := workspaceInspectRepoFunc(ctx, sourcePath, branchName, fromRef)
 	if err != nil {
 		return model.WorkspacePrepareResult{}, err
 	}
@@ -145,7 +173,7 @@ func (d *Daemon) prepareVMWorkspaceLocked(ctx context.Context, vm model.VMRecord
 		return model.WorkspacePrepareResult{}, fmt.Errorf("dial guest ssh: %w", err)
 	}
 	defer client.Close()
-	if err := ws.ImportRepoToGuest(ctx, client, spec, guestPath, mode); err != nil {
+	if err := workspaceImportFunc(ctx, client, spec, guestPath, mode); err != nil {
 		return model.WorkspacePrepareResult{}, err
 	}
 	if readOnly {