diff --git a/internal/daemon/snapshot.go b/internal/daemon/snapshot.go index e184bdf..f6ce45d 100644 --- a/internal/daemon/snapshot.go +++ b/internal/daemon/snapshot.go @@ -84,7 +84,7 @@ func (d *Daemon) cleanupDMSnapshot(ctx context.Context, handles dmSnapshotHandle } func (d *Daemon) removeDMSnapshot(ctx context.Context, target string) error { - deadline := time.Now().Add(3 * time.Second) + deadline := time.Now().Add(15 * time.Second) for { if _, err := d.runner.RunSudo(ctx, "dmsetup", "remove", target); err != nil { if isMissingSnapshotHandle(err) { diff --git a/internal/daemon/snapshot_test.go b/internal/daemon/snapshot_test.go index 2a24597..2411206 100644 --- a/internal/daemon/snapshot_test.go +++ b/internal/daemon/snapshot_test.go @@ -292,3 +292,23 @@ func TestCleanupDMSnapshotJoinsTeardownErrors(t *testing.T) { } runner.assertExhausted() } + +func TestRemoveDMSnapshotRetriesBusyDevice(t *testing.T) { + t.Parallel() + + busyErr := errors.New("exit status 1: device-mapper: remove ioctl on fc-rootfs-test failed: Device or resource busy") + runner := &scriptedRunner{ + t: t, + steps: []runnerStep{ + sudoStep("", busyErr, "dmsetup", "remove", "fc-rootfs-test"), + sudoStep("", busyErr, "dmsetup", "remove", "fc-rootfs-test"), + sudoStep("", nil, "dmsetup", "remove", "fc-rootfs-test"), + }, + } + d := &Daemon{runner: runner} + + if err := d.removeDMSnapshot(context.Background(), "fc-rootfs-test"); err != nil { + t.Fatalf("removeDMSnapshot returned error: %v", err) + } + runner.assertExhausted() +} diff --git a/internal/daemon/vm.go b/internal/daemon/vm.go index d3ba689..511f693 100644 --- a/internal/daemon/vm.go +++ b/internal/daemon/vm.go @@ -824,9 +824,15 @@ func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserve if d.logger != nil { d.logger.Debug("cleanup runtime", append(vmLogAttrs(vm), "preserve_disks", preserveDisks)...) } - if vm.Runtime.PID > 0 && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { - _ = d.killVMProcess(ctx, vm.Runtime.PID) - if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil { + cleanupPID := vm.Runtime.PID + if vm.Runtime.APISockPath != "" { + if pid, err := d.findFirecrackerPID(ctx, vm.Runtime.APISockPath); err == nil && pid > 0 { + cleanupPID = pid + } + } + if cleanupPID > 0 && system.ProcessRunning(cleanupPID, vm.Runtime.APISockPath) { + _ = d.killVMProcess(ctx, cleanupPID) + if err := d.waitForExit(ctx, cleanupPID, vm.Runtime.APISockPath, 30*time.Second); err != nil { return err } } diff --git a/internal/daemon/vm_test.go b/internal/daemon/vm_test.go index e5f9442..c6f44ba 100644 --- a/internal/daemon/vm_test.go +++ b/internal/daemon/vm_test.go @@ -2,6 +2,7 @@ package daemon import ( "context" + "errors" "fmt" "os" "os/exec" @@ -120,6 +121,7 @@ func TestReconcileStopsStaleRunningVMAndClearsRuntimeHandles(t *testing.T) { runner := &scriptedRunner{ t: t, steps: []runnerStep{ + {call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, err: errors.New("exit status 1")}, sudoStep("", nil, "dmsetup", "remove", "fc-rootfs-stale"), sudoStep("", nil, "losetup", "-d", "/dev/loop11"), sudoStep("", nil, "losetup", "-d", "/dev/loop10"), @@ -444,6 +446,40 @@ func TestValidateStartPrereqsReportsNATUplinkFailure(t *testing.T) { runner.assertExhausted() } +func TestCleanupRuntimeRediscoversLiveFirecrackerPID(t *testing.T) { + apiSock := filepath.Join(t.TempDir(), "fc.sock") + fake := startFakeFirecrackerProcess(t, apiSock) + t.Cleanup(func() { + if fake.ProcessState == nil || !fake.ProcessState.Exited() { + _ = fake.Process.Kill() + _ = fake.Wait() + } + }) + + runner := &processKillingRunner{ + scriptedRunner: &scriptedRunner{ + t: t, + steps: []runnerStep{ + {call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, out: []byte(strconv.Itoa(fake.Process.Pid) + "\n")}, + sudoStep("", nil, "kill", "-KILL", strconv.Itoa(fake.Process.Pid)), + }, + }, + proc: fake, + } + d := &Daemon{runner: runner} + vm := testVM("cleanup", "image-cleanup", "172.16.0.22") + vm.Runtime.PID = fake.Process.Pid + 999 + vm.Runtime.APISockPath = apiSock + + if err := d.cleanupRuntime(context.Background(), vm, true); err != nil { + t.Fatalf("cleanupRuntime returned error: %v", err) + } + runner.assertExhausted() + if systemProcessRunning(fake.Process.Pid, apiSock) { + t.Fatalf("fake firecracker pid %d still looks running", fake.Process.Pid) + } +} + func openDaemonStore(t *testing.T) *store.Store { t.Helper() db, err := store.Open(filepath.Join(t.TempDir(), "state.db")) @@ -516,6 +552,27 @@ func startFakeFirecrackerProcess(t *testing.T, apiSock string) *exec.Cmd { return nil } +type processKillingRunner struct { + *scriptedRunner + proc *exec.Cmd +} + +func (r *processKillingRunner) Run(ctx context.Context, name string, args ...string) ([]byte, error) { + return r.scriptedRunner.Run(ctx, name, args...) +} + +func (r *processKillingRunner) RunSudo(ctx context.Context, args ...string) ([]byte, error) { + out, err := r.scriptedRunner.RunSudo(ctx, args...) + if err != nil { + return out, err + } + if len(args) >= 3 && args[0] == "kill" && args[1] == "-KILL" && r.proc != nil && (r.proc.ProcessState == nil || !r.proc.ProcessState.Exited()) { + _ = r.proc.Process.Kill() + _ = r.proc.Wait() + } + return out, nil +} + func systemProcessRunning(pid int, apiSock string) bool { data, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "cmdline")) if err != nil {