Fall back to forced cleanup when stop hangs
Some guests ignore CtrlAltDel long enough that `vm stop` sat in the `wait_for_exit` stage until the client disconnected, leaving the daemon to report "context canceled" instead of actually stopping the VM. Keep the graceful shutdown attempt, but treat a grace-window timeout as a fallback case: log the timeout stage and continue into the existing forced cleanup path instead of returning immediately. Add a regression test, backed by a fake Firecracker API and process, that exercises the timeout-to-cleanup transition. Validated with `go test ./...`, `make build`, and a live `./banger vm stop ea` run that now completes instead of hanging.
This commit is contained in:
parent
3a336f0eeb
commit
2d5bcb5516
2 changed files with 101 additions and 4 deletions
|
|
@ -18,6 +18,11 @@ import (
|
|||
"banger/internal/vmdns"
|
||||
)
|
||||
|
||||
var (
|
||||
errWaitForExitTimeout = errors.New("timed out waiting for VM to exit")
|
||||
gracefulShutdownWait = 10 * time.Second
|
||||
)
|
||||
|
||||
func (d *Daemon) CreateVM(ctx context.Context, params api.VMCreateParams) (vm model.VMRecord, err error) {
|
||||
d.mu.Lock()
|
||||
defer d.mu.Unlock()
|
||||
|
|
@ -320,8 +325,11 @@ func (d *Daemon) StopVM(ctx context.Context, idOrName string) (vm model.VMRecord
|
|||
return model.VMRecord{}, err
|
||||
}
|
||||
op.stage("wait_for_exit", "pid", vm.Runtime.PID)
|
||||
if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
|
||||
return model.VMRecord{}, err
|
||||
if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil {
|
||||
if !errors.Is(err, errWaitForExitTimeout) {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
op.stage("graceful_shutdown_timeout", "pid", vm.Runtime.PID)
|
||||
}
|
||||
op.stage("cleanup_runtime")
|
||||
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
|
||||
|
|
@ -377,7 +385,10 @@ func (d *Daemon) KillVM(ctx context.Context, params api.VMKillParams) (vm model.
|
|||
}
|
||||
op.stage("wait_for_exit", "pid", vm.Runtime.PID)
|
||||
if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
|
||||
return model.VMRecord{}, err
|
||||
if !errors.Is(err, errWaitForExitTimeout) {
|
||||
return model.VMRecord{}, err
|
||||
}
|
||||
op.stage("signal_timeout", "pid", vm.Runtime.PID, "signal", signal)
|
||||
}
|
||||
op.stage("cleanup_runtime")
|
||||
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
|
||||
|
|
@ -810,7 +821,7 @@ func (d *Daemon) waitForExit(ctx context.Context, pid int, apiSock string, timeo
|
|||
return nil
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
return fmt.Errorf("timed out waiting for VM to exit")
|
||||
return errWaitForExitTimeout
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue