Harden VM stop cleanup for stale snapshots

Stop and delete could fail with device-mapper busy errors when the persisted Firecracker PID was stale or the kernel needed longer to release the root snapshot.

Rediscover a live Firecracker process by API socket during cleanup, kill and wait on that PID instead of trusting only the stored runtime PID, and extend dm snapshot removal retries for transient busy handles.

Add daemon regressions for stale-runtime reconcile, rediscovered process cleanup, and repeated busy dm removal. Validate with go test ./..., make build, and a live ./banger vm stop debug-ssh run that now exits cleanly.
This commit is contained in:
Thales Maciel 2026-03-18 12:28:15 -03:00
parent 787b234029
commit 7b7f7e676c
No known key found for this signature in database
GPG key ID: 33112E6833C34679
4 changed files with 87 additions and 4 deletions

View file

@ -2,6 +2,7 @@ package daemon
import (
"context"
"errors"
"fmt"
"os"
"os/exec"
@ -120,6 +121,7 @@ func TestReconcileStopsStaleRunningVMAndClearsRuntimeHandles(t *testing.T) {
runner := &scriptedRunner{
t: t,
steps: []runnerStep{
{call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, err: errors.New("exit status 1")},
sudoStep("", nil, "dmsetup", "remove", "fc-rootfs-stale"),
sudoStep("", nil, "losetup", "-d", "/dev/loop11"),
sudoStep("", nil, "losetup", "-d", "/dev/loop10"),
@ -444,6 +446,40 @@ func TestValidateStartPrereqsReportsNATUplinkFailure(t *testing.T) {
runner.assertExhausted()
}
func TestCleanupRuntimeRediscoversLiveFirecrackerPID(t *testing.T) {
apiSock := filepath.Join(t.TempDir(), "fc.sock")
fake := startFakeFirecrackerProcess(t, apiSock)
t.Cleanup(func() {
if fake.ProcessState == nil || !fake.ProcessState.Exited() {
_ = fake.Process.Kill()
_ = fake.Wait()
}
})
runner := &processKillingRunner{
scriptedRunner: &scriptedRunner{
t: t,
steps: []runnerStep{
{call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, out: []byte(strconv.Itoa(fake.Process.Pid) + "\n")},
sudoStep("", nil, "kill", "-KILL", strconv.Itoa(fake.Process.Pid)),
},
},
proc: fake,
}
d := &Daemon{runner: runner}
vm := testVM("cleanup", "image-cleanup", "172.16.0.22")
vm.Runtime.PID = fake.Process.Pid + 999
vm.Runtime.APISockPath = apiSock
if err := d.cleanupRuntime(context.Background(), vm, true); err != nil {
t.Fatalf("cleanupRuntime returned error: %v", err)
}
runner.assertExhausted()
if systemProcessRunning(fake.Process.Pid, apiSock) {
t.Fatalf("fake firecracker pid %d still looks running", fake.Process.Pid)
}
}
func openDaemonStore(t *testing.T) *store.Store {
t.Helper()
db, err := store.Open(filepath.Join(t.TempDir(), "state.db"))
@ -516,6 +552,27 @@ func startFakeFirecrackerProcess(t *testing.T, apiSock string) *exec.Cmd {
return nil
}
type processKillingRunner struct {
*scriptedRunner
proc *exec.Cmd
}
func (r *processKillingRunner) Run(ctx context.Context, name string, args ...string) ([]byte, error) {
return r.scriptedRunner.Run(ctx, name, args...)
}
func (r *processKillingRunner) RunSudo(ctx context.Context, args ...string) ([]byte, error) {
out, err := r.scriptedRunner.RunSudo(ctx, args...)
if err != nil {
return out, err
}
if len(args) >= 3 && args[0] == "kill" && args[1] == "-KILL" && r.proc != nil && (r.proc.ProcessState == nil || !r.proc.ProcessState.Exited()) {
_ = r.proc.Process.Kill()
_ = r.proc.Wait()
}
return out, nil
}
func systemProcessRunning(pid int, apiSock string) bool {
data, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "cmdline"))
if err != nil {