workspace prepare: release VM mutex before guest I/O

Previously withVMLockByRef held the per-VM mutex across InspectRepo,
waitForGuestSSH, dialGuest, ImportRepoToGuest (the tar stream!), and
the readonly chmod. A large repo could block `vm stop` / `vm delete`
/ `vm restart` on the same VM for however long the import took.

Split into two phases:

  1. VM mutex held briefly to validate state (running + PID alive)
     and snapshot the fields needed for SSH (guest IP, api sock).
  2. VM mutex released. Acquire workspaceLocks[id], a separate
     per-VM mutex scoped to workspace.prepare / workspace.export,
     for the guest I/O phase (see the sketch after this list).
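
Both mutex sets follow the same lazily-populated per-ID table
shape. A minimal sketch of that pattern (illustrative only; the
table name and the unlock-func shape are inferred from the
d.lockVMID helper the tests use, not the daemon's exact code):

  // assumes: import "sync"
  //
  // lockTable hands out one mutex per VM ID, created lazily.
  type lockTable struct {
    mu    sync.Mutex
    locks map[string]*sync.Mutex
  }

  // lock acquires the per-ID mutex and returns its unlock func.
  func (t *lockTable) lock(id string) (unlock func()) {
    t.mu.Lock()
    if t.locks == nil {
      t.locks = make(map[string]*sync.Mutex)
    }
    m := t.locks[id]
    if m == nil {
      m = &sync.Mutex{}
      t.locks[id] = m
    }
    t.mu.Unlock()
    m.Lock()
    return m.Unlock
  }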

Lifecycle ops (stop/delete/restart/set) only take vmLocks, so they
no longer queue behind a slow import. Two concurrent prepares on the
same VM still serialise via workspaceLocks so tar streams don't
interleave. ExportVMWorkspace also acquires workspaceLocks to avoid
snapshotting a half-streamed import.
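
Put together, the prepare path has roughly this shape (a sketch
under the same assumptions as above; error handling, prepare modes,
and the real helper names are elided):

  // prepare splits the work across the two lock sets. snapshot and
  // guestIO are placeholders for state validation and tar streaming.
  func prepare(vmLocks, wsLocks *lockTable, id string,
    snapshot func() (ip, sock string, err error),
    guestIO func(ip, sock string) error,
  ) error {
    // Phase 1: brief vmLocks hold; validate and copy SSH fields.
    unlock := vmLocks.lock(id)
    ip, sock, err := snapshot()
    unlock() // released before any guest I/O starts
    if err != nil {
      return err
    }
    // Phase 2: slow guest I/O serialised per VM by wsLocks only,
    // so stop/delete/restart can take vmLocks concurrently.
    unlockWS := wsLocks.lock(id)
    defer unlockWS()
    return guestIO(ip, sock)
  }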

Two regression tests (sequential — they swap package-level seams):

  ReleasesVMLockDuringGuestIO: stall the import fake, assert the VM
  mutex is acquirable from another goroutine during the stall.

  SerialisesConcurrentPreparesOnSameVM: 3 concurrent prepares, assert
  Import is only ever invoked 1-at-a-time per VM.

ARCHITECTURE.md documents the split + updated lock ordering.
Thales Maciel 2026-04-19 13:32:42 -03:00
parent 99de42385f
commit 6cd52d12f4
4 changed files with 265 additions and 22 deletions

@@ -7,9 +7,12 @@ import (
	"os"
	"path/filepath"
	"strings"
	"sync/atomic"
	"testing"
	"time"
	"banger/internal/api"
	"banger/internal/daemon/workspace"
	"banger/internal/model"
)
@@ -356,6 +359,200 @@ func TestExportVMWorkspace_MultipleChangedFiles(t *testing.T) {
	}
}

// TestPrepareVMWorkspace_ReleasesVMLockDuringGuestIO is a regression
// guard for an earlier design that held the per-VM mutex across SSH
// dial, tar streaming, and remote chmod. A long import could then
// block unrelated lifecycle ops (vm stop / delete / restart) on the
// same VM until it completed. The fix switched to a dedicated
// workspaceLocks set for I/O, with vmLocks held only for the brief
// state-validation phase. This test kicks off a prepare that blocks
// inside the import step and then asserts the VM mutex is acquirable
// while the prepare is mid-flight.
func TestPrepareVMWorkspace_ReleasesVMLockDuringGuestIO(t *testing.T) {
	// Not parallel: mutates package-level workspaceInspectRepoFunc /
	// workspaceImportFunc seams, which the other prepare-concurrency
	// test also swaps.
	ctx := context.Background()
	apiSock := filepath.Join(t.TempDir(), "fc.sock")
	firecracker := startFakeFirecracker(t, apiSock)
	vm := testVM("lockbox", "image-x", "172.16.0.210")
	vm.State = model.VMStateRunning
	vm.Runtime.State = model.VMStateRunning
	vm.Runtime.PID = firecracker.Process.Pid
	vm.Runtime.APISockPath = apiSock
	d := &Daemon{
		store:  openDaemonStore(t),
		config: model.DaemonConfig{SSHKeyPath: filepath.Join(t.TempDir(), "id_ed25519")},
		logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
	}
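	// Stub the SSH wait/dial seams so the test never contacts a
	// real guest.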
	d.guestWaitForSSH = func(_ context.Context, _, _ string, _ time.Duration) error { return nil }
	d.guestDial = func(_ context.Context, _, _ string) (guestSSHClient, error) {
		return &exportGuestClient{}, nil
	}
	upsertDaemonVM(t, ctx, d.store, vm)
	// Replace the seams. InspectRepo returns a trivial spec so the
	// real filesystem isn't touched; Import blocks until we say go.
	origInspect := workspaceInspectRepoFunc
	origImport := workspaceImportFunc
	t.Cleanup(func() {
		workspaceInspectRepoFunc = origInspect
		workspaceImportFunc = origImport
	})
	importStarted := make(chan struct{})
	releaseImport := make(chan struct{})
	workspaceInspectRepoFunc = func(context.Context, string, string, string) (workspace.RepoSpec, error) {
		return workspace.RepoSpec{RepoName: "fake", RepoRoot: "/tmp/fake"}, nil
	}
	workspaceImportFunc = func(context.Context, workspace.GuestClient, workspace.RepoSpec, string, model.WorkspacePrepareMode) error {
		close(importStarted)
		<-releaseImport
		return nil
	}
	// Kick off prepare in a goroutine. It will block inside the import.
	prepareDone := make(chan error, 1)
	go func() {
		_, err := d.PrepareVMWorkspace(ctx, api.VMWorkspacePrepareParams{
			IDOrName:   vm.Name,
			SourcePath: "/tmp/fake",
		})
		prepareDone <- err
	}()
	// Wait for prepare to reach the guest-I/O phase (past the VM
	// mutex) before testing the assertion.
	select {
	case <-importStarted:
	case <-time.After(2 * time.Second):
		t.Fatal("import never started; prepare blocked before reaching guest I/O")
	}
	// With the fix in place, the VM mutex is free even though the
	// import is in flight. Acquiring it must not wait.
	acquired := make(chan struct{})
	go func() {
		unlock := d.lockVMID(vm.ID)
		close(acquired)
		unlock()
	}()
	select {
	case <-acquired:
	case <-time.After(500 * time.Millisecond):
		close(releaseImport) // unblock the goroutine so the test can exit
		<-prepareDone
		t.Fatal("VM mutex held during guest I/O; lifecycle ops would block behind workspace prepare")
	}
	// Now let the import finish and make sure prepare returns.
	close(releaseImport)
	select {
	case err := <-prepareDone:
		if err != nil {
			t.Fatalf("prepare returned error: %v", err)
		}
	case <-time.After(2 * time.Second):
		t.Fatal("prepare did not return after import unblocked")
	}
}

// TestPrepareVMWorkspace_SerialisesConcurrentPreparesOnSameVM asserts
// the workspaceLocks scope: two concurrent prepares on the same VM do
// NOT interleave, even though they no longer take the core VM mutex.
func TestPrepareVMWorkspace_SerialisesConcurrentPreparesOnSameVM(t *testing.T) {
	// Not parallel: see note on ReleasesVMLockDuringGuestIO.
	ctx := context.Background()
	apiSock := filepath.Join(t.TempDir(), "fc.sock")
	firecracker := startFakeFirecracker(t, apiSock)
	vm := testVM("serialbox", "image-x", "172.16.0.211")
	vm.State = model.VMStateRunning
	vm.Runtime.State = model.VMStateRunning
	vm.Runtime.PID = firecracker.Process.Pid
	vm.Runtime.APISockPath = apiSock
	d := &Daemon{
		store:  openDaemonStore(t),
		config: model.DaemonConfig{SSHKeyPath: filepath.Join(t.TempDir(), "id_ed25519")},
		logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
	}
	d.guestWaitForSSH = func(_ context.Context, _, _ string, _ time.Duration) error { return nil }
	d.guestDial = func(_ context.Context, _, _ string) (guestSSHClient, error) {
		return &exportGuestClient{}, nil
	}
	upsertDaemonVM(t, ctx, d.store, vm)
	origInspect := workspaceInspectRepoFunc
	origImport := workspaceImportFunc
	t.Cleanup(func() {
		workspaceInspectRepoFunc = origInspect
		workspaceImportFunc = origImport
	})
	workspaceInspectRepoFunc = func(context.Context, string, string, string) (workspace.RepoSpec, error) {
		return workspace.RepoSpec{RepoName: "fake", RepoRoot: "/tmp/fake"}, nil
	}
	// Counter of simultaneous Import calls. Should never exceed 1.
	var active int32
	var maxObserved int32
	release := make(chan struct{})
	workspaceImportFunc = func(context.Context, workspace.GuestClient, workspace.RepoSpec, string, model.WorkspacePrepareMode) error {
		n := atomic.AddInt32(&active, 1)
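		// Lock-free running max: retry the CAS until maxObserved
		// already holds a value >= this call's concurrency level.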
		for {
			prev := atomic.LoadInt32(&maxObserved)
			if n <= prev || atomic.CompareAndSwapInt32(&maxObserved, prev, n) {
				break
			}
		}
		<-release
		atomic.AddInt32(&active, -1)
		return nil
	}
	const n = 3
	done := make(chan error, n)
	for i := 0; i < n; i++ {
		go func() {
			_, err := d.PrepareVMWorkspace(ctx, api.VMWorkspacePrepareParams{
				IDOrName:   vm.Name,
				SourcePath: "/tmp/fake",
			})
			done <- err
		}()
	}
	// Give goroutines a moment to queue up.
	time.Sleep(100 * time.Millisecond)
	if got := atomic.LoadInt32(&active); got != 1 {
		close(release) // unblock to avoid hang
		for i := 0; i < n; i++ {
			<-done
		}
		t.Fatalf("%d concurrent imports, want exactly 1 (workspace lock should serialise)", got)
	}
	// Drain: release imports one at a time.
	for i := 0; i < n; i++ {
		release <- struct{}{}
	}
	close(release)
	for i := 0; i < n; i++ {
		if err := <-done; err != nil {
			t.Errorf("prepare #%d error: %v", i, err)
		}
	}
	if got := atomic.LoadInt32(&maxObserved); got != 1 {
		t.Fatalf("peak concurrent imports = %d, want 1", got)
	}
}

// TestExportVMWorkspace_DoesNotMutateRealIndex is a regression guard
// for an earlier design where `git add -A` ran against the guest's
// real `.git/index`, leaving staged changes behind after what the user