workspace prepare: release VM mutex before guest I/O
Previously withVMLockByRef held the per-VM mutex across InspectRepo,
waitForGuestSSH, dialGuest, ImportRepoToGuest (the tar stream!), and
the readonly chmod. A large repo could block `vm stop` / `vm delete`
/ `vm restart` on the same VM for however long the import took.
Split into two phases:
1. VM mutex held briefly to validate state (running + PID alive)
and snapshot the fields needed for SSH (guest IP, api sock).
2. VM mutex released. Acquire workspaceLocks[id] — a separate
per-VM mutex scoped to workspace.prepare / workspace.export —
for the guest I/O phase.
Lifecycle ops (stop/delete/restart/set) only take vmLocks, so they
no longer queue behind a slow import. Two concurrent prepares on the
same VM still serialise via workspaceLocks so tar streams don't
interleave. ExportVMWorkspace also acquires workspaceLocks to avoid
snapshotting a half-streamed import.
Two regression tests (sequential — they swap package-level seams):
ReleasesVMLockDuringGuestIO: stall the import fake, assert the VM
mutex is acquirable from another goroutine during the stall.
SerialisesConcurrentPreparesOnSameVM: 3 concurrent prepares, assert
Import is only ever invoked 1-at-a-time per VM.
ARCHITECTURE.md documents the split + updated lock ordering.
This commit is contained in:
parent
99de42385f
commit
6cd52d12f4
4 changed files with 265 additions and 22 deletions
|
|
@ -7,9 +7,12 @@ import (
|
|||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"banger/internal/api"
|
||||
"banger/internal/daemon/workspace"
|
||||
"banger/internal/model"
|
||||
)
|
||||
|
||||
|
|
@ -356,6 +359,200 @@ func TestExportVMWorkspace_MultipleChangedFiles(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
// TestPrepareVMWorkspace_ReleasesVMLockDuringGuestIO is a regression
|
||||
// guard for an earlier design that held the per-VM mutex across SSH
|
||||
// dial, tar streaming, and remote chmod. A long import could then
|
||||
// block unrelated lifecycle ops (vm stop / delete / restart) on the
|
||||
// same VM until it completed. The fix switched to a dedicated
|
||||
// workspaceLocks set for I/O, with vmLocks held only for the brief
|
||||
// state-validation phase. This test kicks off a prepare that blocks
|
||||
// inside the import step and then asserts the VM mutex is acquirable
|
||||
// while the prepare is mid-flight.
|
||||
func TestPrepareVMWorkspace_ReleasesVMLockDuringGuestIO(t *testing.T) {
|
||||
// Not parallel: mutates package-level workspaceInspectRepoFunc /
|
||||
// workspaceImportFunc seams, which the other prepare-concurrency
|
||||
// test also swaps.
|
||||
ctx := context.Background()
|
||||
|
||||
apiSock := filepath.Join(t.TempDir(), "fc.sock")
|
||||
firecracker := startFakeFirecracker(t, apiSock)
|
||||
|
||||
vm := testVM("lockbox", "image-x", "172.16.0.210")
|
||||
vm.State = model.VMStateRunning
|
||||
vm.Runtime.State = model.VMStateRunning
|
||||
vm.Runtime.PID = firecracker.Process.Pid
|
||||
vm.Runtime.APISockPath = apiSock
|
||||
|
||||
d := &Daemon{
|
||||
store: openDaemonStore(t),
|
||||
config: model.DaemonConfig{SSHKeyPath: filepath.Join(t.TempDir(), "id_ed25519")},
|
||||
logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
|
||||
}
|
||||
d.guestWaitForSSH = func(_ context.Context, _, _ string, _ time.Duration) error { return nil }
|
||||
d.guestDial = func(_ context.Context, _, _ string) (guestSSHClient, error) {
|
||||
return &exportGuestClient{}, nil
|
||||
}
|
||||
upsertDaemonVM(t, ctx, d.store, vm)
|
||||
|
||||
// Replace the seams. InspectRepo returns a trivial spec so the
|
||||
// real filesystem isn't touched; Import blocks until we say go.
|
||||
origInspect := workspaceInspectRepoFunc
|
||||
origImport := workspaceImportFunc
|
||||
t.Cleanup(func() {
|
||||
workspaceInspectRepoFunc = origInspect
|
||||
workspaceImportFunc = origImport
|
||||
})
|
||||
|
||||
importStarted := make(chan struct{})
|
||||
releaseImport := make(chan struct{})
|
||||
workspaceInspectRepoFunc = func(context.Context, string, string, string) (workspace.RepoSpec, error) {
|
||||
return workspace.RepoSpec{RepoName: "fake", RepoRoot: "/tmp/fake"}, nil
|
||||
}
|
||||
workspaceImportFunc = func(context.Context, workspace.GuestClient, workspace.RepoSpec, string, model.WorkspacePrepareMode) error {
|
||||
close(importStarted)
|
||||
<-releaseImport
|
||||
return nil
|
||||
}
|
||||
|
||||
// Kick off prepare in a goroutine. It will block inside the import.
|
||||
prepareDone := make(chan error, 1)
|
||||
go func() {
|
||||
_, err := d.PrepareVMWorkspace(ctx, api.VMWorkspacePrepareParams{
|
||||
IDOrName: vm.Name,
|
||||
SourcePath: "/tmp/fake",
|
||||
})
|
||||
prepareDone <- err
|
||||
}()
|
||||
|
||||
// Wait for prepare to reach the guest-I/O phase (past the VM
|
||||
// mutex) before testing the assertion.
|
||||
select {
|
||||
case <-importStarted:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("import never started; prepare blocked before reaching guest I/O")
|
||||
}
|
||||
|
||||
// With the fix in place, the VM mutex is free even though the
|
||||
// import is in flight. Acquiring it must not wait.
|
||||
acquired := make(chan struct{})
|
||||
go func() {
|
||||
unlock := d.lockVMID(vm.ID)
|
||||
close(acquired)
|
||||
unlock()
|
||||
}()
|
||||
select {
|
||||
case <-acquired:
|
||||
case <-time.After(500 * time.Millisecond):
|
||||
close(releaseImport) // unblock the goroutine so the test can exit
|
||||
<-prepareDone
|
||||
t.Fatal("VM mutex held during guest I/O — lifecycle ops would block behind workspace prepare")
|
||||
}
|
||||
|
||||
// Now let the import finish and make sure prepare returns.
|
||||
close(releaseImport)
|
||||
select {
|
||||
case err := <-prepareDone:
|
||||
if err != nil {
|
||||
t.Fatalf("prepare returned error: %v", err)
|
||||
}
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("prepare did not return after import unblocked")
|
||||
}
|
||||
}
|
||||
|
||||
// TestPrepareVMWorkspace_SerialisesConcurrentPreparesOnSameVM asserts
|
||||
// the workspaceLocks scope: two concurrent prepares on the same VM do
|
||||
// NOT interleave, even though they no longer take the core VM mutex.
|
||||
func TestPrepareVMWorkspace_SerialisesConcurrentPreparesOnSameVM(t *testing.T) {
|
||||
// Not parallel: see note on ReleasesVMLockDuringGuestIO.
|
||||
ctx := context.Background()
|
||||
|
||||
apiSock := filepath.Join(t.TempDir(), "fc.sock")
|
||||
firecracker := startFakeFirecracker(t, apiSock)
|
||||
|
||||
vm := testVM("serialbox", "image-x", "172.16.0.211")
|
||||
vm.State = model.VMStateRunning
|
||||
vm.Runtime.State = model.VMStateRunning
|
||||
vm.Runtime.PID = firecracker.Process.Pid
|
||||
vm.Runtime.APISockPath = apiSock
|
||||
|
||||
d := &Daemon{
|
||||
store: openDaemonStore(t),
|
||||
config: model.DaemonConfig{SSHKeyPath: filepath.Join(t.TempDir(), "id_ed25519")},
|
||||
logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
|
||||
}
|
||||
d.guestWaitForSSH = func(_ context.Context, _, _ string, _ time.Duration) error { return nil }
|
||||
d.guestDial = func(_ context.Context, _, _ string) (guestSSHClient, error) {
|
||||
return &exportGuestClient{}, nil
|
||||
}
|
||||
upsertDaemonVM(t, ctx, d.store, vm)
|
||||
|
||||
origInspect := workspaceInspectRepoFunc
|
||||
origImport := workspaceImportFunc
|
||||
t.Cleanup(func() {
|
||||
workspaceInspectRepoFunc = origInspect
|
||||
workspaceImportFunc = origImport
|
||||
})
|
||||
|
||||
workspaceInspectRepoFunc = func(context.Context, string, string, string) (workspace.RepoSpec, error) {
|
||||
return workspace.RepoSpec{RepoName: "fake", RepoRoot: "/tmp/fake"}, nil
|
||||
}
|
||||
|
||||
// Counter of simultaneous Import calls. Should never exceed 1.
|
||||
var active int32
|
||||
var maxObserved int32
|
||||
release := make(chan struct{})
|
||||
workspaceImportFunc = func(context.Context, workspace.GuestClient, workspace.RepoSpec, string, model.WorkspacePrepareMode) error {
|
||||
n := atomic.AddInt32(&active, 1)
|
||||
for {
|
||||
prev := atomic.LoadInt32(&maxObserved)
|
||||
if n <= prev || atomic.CompareAndSwapInt32(&maxObserved, prev, n) {
|
||||
break
|
||||
}
|
||||
}
|
||||
<-release
|
||||
atomic.AddInt32(&active, -1)
|
||||
return nil
|
||||
}
|
||||
|
||||
const n = 3
|
||||
done := make(chan error, n)
|
||||
for i := 0; i < n; i++ {
|
||||
go func() {
|
||||
_, err := d.PrepareVMWorkspace(ctx, api.VMWorkspacePrepareParams{
|
||||
IDOrName: vm.Name,
|
||||
SourcePath: "/tmp/fake",
|
||||
})
|
||||
done <- err
|
||||
}()
|
||||
}
|
||||
|
||||
// Give goroutines a moment to queue up.
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
if got := atomic.LoadInt32(&active); got != 1 {
|
||||
close(release) // unblock to avoid hang
|
||||
for i := 0; i < n; i++ {
|
||||
<-done
|
||||
}
|
||||
t.Fatalf("%d concurrent imports, want exactly 1 (workspace lock should serialise)", got)
|
||||
}
|
||||
|
||||
// Drain: release imports one at a time.
|
||||
for i := 0; i < n; i++ {
|
||||
release <- struct{}{}
|
||||
}
|
||||
close(release)
|
||||
for i := 0; i < n; i++ {
|
||||
if err := <-done; err != nil {
|
||||
t.Errorf("prepare #%d error: %v", i, err)
|
||||
}
|
||||
}
|
||||
if got := atomic.LoadInt32(&maxObserved); got != 1 {
|
||||
t.Fatalf("peak concurrent imports = %d, want 1", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestExportVMWorkspace_DoesNotMutateRealIndex is a regression guard
|
||||
// for an earlier design where `git add -A` ran against the guest's
|
||||
// real `.git/index`, leaving staged changes behind after what the user
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue