Fall back to forced cleanup when stop hangs

Some guests ignore CtrlAltDel long enough that vm stop sat in wait_for_exit until the client disconnected, leaving the daemon to report context canceled instead of actually stopping the VM.

Keep the graceful shutdown attempt, but treat a grace-window timeout as a fallback case: log the timeout stage and continue into the existing forced cleanup path instead of returning immediately. Add a regression test, backed by a fake Firecracker API and process, that exercises the timeout-to-cleanup transition.

Validated with `go test ./...`, `make build`, and a live `./banger vm stop ea` run that now completes instead of hanging.
This commit is contained in:
Thales Maciel 2026-03-18 13:44:59 -03:00
parent 3a336f0eeb
commit 2d5bcb5516
No known key found for this signature in database
GPG key ID: 33112E6833C34679
2 changed files with 101 additions and 4 deletions

View file

@ -18,6 +18,11 @@ import (
"banger/internal/vmdns"
)
var (
	// errWaitForExitTimeout is the sentinel returned by waitForExit when the
	// VM process is still alive after the deadline. StopVM and KillVM match
	// it with errors.Is and fall through to forced cleanup instead of
	// failing the operation.
	errWaitForExitTimeout = errors.New("timed out waiting for VM to exit")
	// gracefulShutdownWait is how long StopVM waits for the guest to honor
	// the graceful shutdown before forcing cleanup. Declared as a var (not
	// a const) so tests can shorten it.
	gracefulShutdownWait = 10 * time.Second
)
func (d *Daemon) CreateVM(ctx context.Context, params api.VMCreateParams) (vm model.VMRecord, err error) {
d.mu.Lock()
defer d.mu.Unlock()
@ -320,9 +325,12 @@ func (d *Daemon) StopVM(ctx context.Context, idOrName string) (vm model.VMRecord
return model.VMRecord{}, err
}
op.stage("wait_for_exit", "pid", vm.Runtime.PID)
if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil {
if !errors.Is(err, errWaitForExitTimeout) {
return model.VMRecord{}, err
}
op.stage("graceful_shutdown_timeout", "pid", vm.Runtime.PID)
}
op.stage("cleanup_runtime")
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
return model.VMRecord{}, err
@ -377,8 +385,11 @@ func (d *Daemon) KillVM(ctx context.Context, params api.VMKillParams) (vm model.
}
op.stage("wait_for_exit", "pid", vm.Runtime.PID)
if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
if !errors.Is(err, errWaitForExitTimeout) {
return model.VMRecord{}, err
}
op.stage("signal_timeout", "pid", vm.Runtime.PID, "signal", signal)
}
op.stage("cleanup_runtime")
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
return model.VMRecord{}, err
@ -810,7 +821,7 @@ func (d *Daemon) waitForExit(ctx context.Context, pid int, apiSock string, timeo
return nil
}
if time.Now().After(deadline) {
return fmt.Errorf("timed out waiting for VM to exit")
return errWaitForExitTimeout
}
select {
case <-ctx.Done():

View file

@ -4,6 +4,8 @@ import (
"context"
"errors"
"fmt"
"net"
"net/http"
"os"
"os/exec"
"path/filepath"
@ -480,6 +482,62 @@ func TestCleanupRuntimeRediscoversLiveFirecrackerPID(t *testing.T) {
}
}
// TestStopVMFallsBackToForcedCleanupAfterGracefulTimeout pins the regression
// where a guest that ignores the graceful shutdown left StopVM hanging: with
// a shortened grace window, StopVM must time out, proceed to forced cleanup,
// and still report the VM as stopped with its runtime handles cleared.
func TestStopVMFallsBackToForcedCleanupAfterGracefulTimeout(t *testing.T) {
	ctx := context.Background()
	st := openDaemonStore(t)
	apiSock := filepath.Join(t.TempDir(), "fc.sock")
	startFakeFirecrackerAPI(t, apiSock)

	// A real child process that never exits on its own stands in for the
	// stubborn Firecracker VMM; make sure it is reaped even on test failure.
	fcProc := startFakeFirecrackerProcess(t, apiSock)
	t.Cleanup(func() {
		stillRunning := fcProc.ProcessState == nil || !fcProc.ProcessState.Exited()
		if stillRunning {
			_ = fcProc.Process.Kill()
			_ = fcProc.Wait()
		}
	})

	// Shrink the grace window so the timeout path triggers quickly; restore
	// the package-level default afterwards.
	savedWait := gracefulShutdownWait
	t.Cleanup(func() {
		gracefulShutdownWait = savedWait
	})
	gracefulShutdownWait = 50 * time.Millisecond

	rec := testVM("stubborn", "image-stubborn", "172.16.0.23")
	rec.State = model.VMStateRunning
	rec.Runtime.State = model.VMStateRunning
	rec.Runtime.PID = fcProc.Process.Pid
	rec.Runtime.APISockPath = apiSock
	if err := st.UpsertVM(ctx, rec); err != nil {
		t.Fatalf("UpsertVM: %v", err)
	}

	// Script the forced-cleanup commands the daemon is expected to run once
	// the graceful wait times out, ending in a hard kill of the fake VMM.
	run := &processKillingRunner{
		scriptedRunner: &scriptedRunner{
			t: t,
			steps: []runnerStep{
				sudoStep("", nil, "chown", fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid()), apiSock),
				sudoStep("", nil, "chmod", "600", apiSock),
				{call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, out: []byte(strconv.Itoa(fcProc.Process.Pid) + "\n")},
				sudoStep("", nil, "kill", "-KILL", strconv.Itoa(fcProc.Process.Pid)),
			},
		},
		proc: fcProc,
	}

	d := &Daemon{store: st, runner: run}
	stopped, err := d.StopVM(ctx, rec.ID)
	if err != nil {
		t.Fatalf("StopVM returned error: %v", err)
	}
	run.assertExhausted()
	if stopped.State != model.VMStateStopped || stopped.Runtime.State != model.VMStateStopped {
		t.Fatalf("StopVM state = %s/%s, want stopped", stopped.State, stopped.Runtime.State)
	}
	if stopped.Runtime.PID != 0 || stopped.Runtime.APISockPath != "" {
		t.Fatalf("runtime handles not cleared: %+v", stopped.Runtime)
	}
}
func openDaemonStore(t *testing.T) *store.Store {
t.Helper()
db, err := store.Open(filepath.Join(t.TempDir(), "state.db"))
@ -552,6 +610,34 @@ func startFakeFirecrackerProcess(t *testing.T, apiSock string) *exec.Cmd {
return nil
}
func startFakeFirecrackerAPI(t *testing.T, apiSock string) {
t.Helper()
if err := os.MkdirAll(filepath.Dir(apiSock), 0o755); err != nil {
t.Fatalf("MkdirAll(%s): %v", filepath.Dir(apiSock), err)
}
listener, err := net.Listen("unix", apiSock)
if err != nil {
t.Fatalf("listen unix %s: %v", apiSock, err)
}
mux := http.NewServeMux()
mux.HandleFunc("/actions", func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPut {
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
return
}
w.WriteHeader(http.StatusNoContent)
})
server := &http.Server{Handler: mux}
go func() {
_ = server.Serve(listener)
}()
t.Cleanup(func() {
_ = server.Close()
_ = os.Remove(apiSock)
})
}
type processKillingRunner struct {
*scriptedRunner
proc *exec.Cmd