Fall back to forced cleanup when stop hangs

Some guests ignore CtrlAltDel long enough that vm stop sat in wait_for_exit until the client disconnected, leaving the daemon to report context canceled instead of actually stopping the VM.

Keep the graceful shutdown attempt, but treat a grace-window timeout as a fallback case: log the timeout stage and continue into the existing forced cleanup path instead of returning immediately. Add a fake Firecracker API/process regression that exercises the timeout-to-cleanup transition.

Validate with go test ./..., make build, and a live ./banger vm stop ea run that now completes instead of hanging.
This commit is contained in:
Thales Maciel 2026-03-18 13:44:59 -03:00
parent 3a336f0eeb
commit 2d5bcb5516
No known key found for this signature in database
GPG key ID: 33112E6833C34679
2 changed files with 101 additions and 4 deletions

View file

@ -18,6 +18,11 @@ import (
"banger/internal/vmdns"
)
var (
errWaitForExitTimeout = errors.New("timed out waiting for VM to exit")
gracefulShutdownWait = 10 * time.Second
)
func (d *Daemon) CreateVM(ctx context.Context, params api.VMCreateParams) (vm model.VMRecord, err error) {
d.mu.Lock()
defer d.mu.Unlock()
@ -320,8 +325,11 @@ func (d *Daemon) StopVM(ctx context.Context, idOrName string) (vm model.VMRecord
return model.VMRecord{}, err
}
op.stage("wait_for_exit", "pid", vm.Runtime.PID)
if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
return model.VMRecord{}, err
if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil {
if !errors.Is(err, errWaitForExitTimeout) {
return model.VMRecord{}, err
}
op.stage("graceful_shutdown_timeout", "pid", vm.Runtime.PID)
}
op.stage("cleanup_runtime")
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
@ -377,7 +385,10 @@ func (d *Daemon) KillVM(ctx context.Context, params api.VMKillParams) (vm model.
}
op.stage("wait_for_exit", "pid", vm.Runtime.PID)
if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
return model.VMRecord{}, err
if !errors.Is(err, errWaitForExitTimeout) {
return model.VMRecord{}, err
}
op.stage("signal_timeout", "pid", vm.Runtime.PID, "signal", signal)
}
op.stage("cleanup_runtime")
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
@ -810,7 +821,7 @@ func (d *Daemon) waitForExit(ctx context.Context, pid int, apiSock string, timeo
return nil
}
if time.Now().After(deadline) {
return fmt.Errorf("timed out waiting for VM to exit")
return errWaitForExitTimeout
}
select {
case <-ctx.Done():