From 7b7f7e676c5df37d8852487e9cf2b5f211e4a253 Mon Sep 17 00:00:00 2001 From: Thales Maciel Date: Wed, 18 Mar 2026 12:28:15 -0300 Subject: [PATCH] Harden VM stop cleanup for stale snapshots Stop and delete could fail with device-mapper busy errors when the persisted Firecracker PID was stale or the kernel needed longer to release the root snapshot. Rediscover a live Firecracker process by API socket during cleanup, kill and wait on that PID instead of trusting only the stored runtime PID, and extend dm snapshot removal retries for transient busy handles. Add daemon regressions for stale-runtime reconcile, rediscovered process cleanup, and repeated busy dm removal. Validate with go test ./..., make build, and a live ./banger vm stop debug-ssh run that now exits cleanly. --- internal/daemon/snapshot.go | 2 +- internal/daemon/snapshot_test.go | 20 +++++++++++ internal/daemon/vm.go | 12 +++++-- internal/daemon/vm_test.go | 57 ++++++++++++++++++++++++++++++++ 4 files changed, 87 insertions(+), 4 deletions(-) diff --git a/internal/daemon/snapshot.go b/internal/daemon/snapshot.go index e184bdf..f6ce45d 100644 --- a/internal/daemon/snapshot.go +++ b/internal/daemon/snapshot.go @@ -84,7 +84,7 @@ func (d *Daemon) cleanupDMSnapshot(ctx context.Context, handles dmSnapshotHandle } func (d *Daemon) removeDMSnapshot(ctx context.Context, target string) error { - deadline := time.Now().Add(3 * time.Second) + deadline := time.Now().Add(15 * time.Second) for { if _, err := d.runner.RunSudo(ctx, "dmsetup", "remove", target); err != nil { if isMissingSnapshotHandle(err) { diff --git a/internal/daemon/snapshot_test.go b/internal/daemon/snapshot_test.go index 2a24597..2411206 100644 --- a/internal/daemon/snapshot_test.go +++ b/internal/daemon/snapshot_test.go @@ -292,3 +292,23 @@ func TestCleanupDMSnapshotJoinsTeardownErrors(t *testing.T) { } runner.assertExhausted() } + +func TestRemoveDMSnapshotRetriesBusyDevice(t *testing.T) { + t.Parallel() + + busyErr := errors.New("exit status 1: device-mapper: remove ioctl on fc-rootfs-test failed: Device or resource busy") + runner := &scriptedRunner{ + t: t, + steps: []runnerStep{ + sudoStep("", busyErr, "dmsetup", "remove", "fc-rootfs-test"), + sudoStep("", busyErr, "dmsetup", "remove", "fc-rootfs-test"), + sudoStep("", nil, "dmsetup", "remove", "fc-rootfs-test"), + }, + } + d := &Daemon{runner: runner} + + if err := d.removeDMSnapshot(context.Background(), "fc-rootfs-test"); err != nil { + t.Fatalf("removeDMSnapshot returned error: %v", err) + } + runner.assertExhausted() +} diff --git a/internal/daemon/vm.go b/internal/daemon/vm.go index d3ba689..511f693 100644 --- a/internal/daemon/vm.go +++ b/internal/daemon/vm.go @@ -824,9 +824,15 @@ func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserve if d.logger != nil { d.logger.Debug("cleanup runtime", append(vmLogAttrs(vm), "preserve_disks", preserveDisks)...) } - if vm.Runtime.PID > 0 && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { - _ = d.killVMProcess(ctx, vm.Runtime.PID) - if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil { + cleanupPID := vm.Runtime.PID + if vm.Runtime.APISockPath != "" { + if pid, err := d.findFirecrackerPID(ctx, vm.Runtime.APISockPath); err == nil && pid > 0 { + cleanupPID = pid + } + } + if cleanupPID > 0 && system.ProcessRunning(cleanupPID, vm.Runtime.APISockPath) { + _ = d.killVMProcess(ctx, cleanupPID) + if err := d.waitForExit(ctx, cleanupPID, vm.Runtime.APISockPath, 30*time.Second); err != nil { return err } } diff --git a/internal/daemon/vm_test.go b/internal/daemon/vm_test.go index e5f9442..c6f44ba 100644 --- a/internal/daemon/vm_test.go +++ b/internal/daemon/vm_test.go @@ -2,6 +2,7 @@ package daemon import ( "context" + "errors" "fmt" "os" "os/exec" @@ -120,6 +121,7 @@ func TestReconcileStopsStaleRunningVMAndClearsRuntimeHandles(t *testing.T) { runner := &scriptedRunner{ t: t, steps: []runnerStep{ + {call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, err: errors.New("exit status 1")}, sudoStep("", nil, "dmsetup", "remove", "fc-rootfs-stale"), sudoStep("", nil, "losetup", "-d", "/dev/loop11"), sudoStep("", nil, "losetup", "-d", "/dev/loop10"), @@ -444,6 +446,40 @@ func TestValidateStartPrereqsReportsNATUplinkFailure(t *testing.T) { runner.assertExhausted() } +func TestCleanupRuntimeRediscoversLiveFirecrackerPID(t *testing.T) { + apiSock := filepath.Join(t.TempDir(), "fc.sock") + fake := startFakeFirecrackerProcess(t, apiSock) + t.Cleanup(func() { + if fake.ProcessState == nil || !fake.ProcessState.Exited() { + _ = fake.Process.Kill() + _ = fake.Wait() + } + }) + + runner := &processKillingRunner{ + scriptedRunner: &scriptedRunner{ + t: t, + steps: []runnerStep{ + {call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, out: []byte(strconv.Itoa(fake.Process.Pid) + "\n")}, + sudoStep("", nil, "kill", "-KILL", strconv.Itoa(fake.Process.Pid)), + }, + }, + proc: fake, + } + d := &Daemon{runner: runner} + vm := testVM("cleanup", "image-cleanup", "172.16.0.22") + vm.Runtime.PID = fake.Process.Pid + 999 + vm.Runtime.APISockPath = apiSock + + if err := d.cleanupRuntime(context.Background(), vm, true); err != nil { + t.Fatalf("cleanupRuntime returned error: %v", err) + } + runner.assertExhausted() + if systemProcessRunning(fake.Process.Pid, apiSock) { + t.Fatalf("fake firecracker pid %d still looks running", fake.Process.Pid) + } +} + func openDaemonStore(t *testing.T) *store.Store { t.Helper() db, err := store.Open(filepath.Join(t.TempDir(), "state.db")) @@ -516,6 +552,27 @@ func startFakeFirecrackerProcess(t *testing.T, apiSock string) *exec.Cmd { return nil } +type processKillingRunner struct { + *scriptedRunner + proc *exec.Cmd +} + +func (r *processKillingRunner) Run(ctx context.Context, name string, args ...string) ([]byte, error) { + return r.scriptedRunner.Run(ctx, name, args...) +} + +func (r *processKillingRunner) RunSudo(ctx context.Context, args ...string) ([]byte, error) { + out, err := r.scriptedRunner.RunSudo(ctx, args...) + if err != nil { + return out, err + } + if len(args) >= 3 && args[0] == "kill" && args[1] == "-KILL" && r.proc != nil && (r.proc.ProcessState == nil || !r.proc.ProcessState.Exited()) { + _ = r.proc.Process.Kill() + _ = r.proc.Wait() + } + return out, nil +} + func systemProcessRunning(pid int, apiSock string) bool { data, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "cmdline")) if err != nil {