diff --git a/internal/daemon/vm_lifecycle.go b/internal/daemon/vm_lifecycle.go index e759bc6..ca0aad7 100644 --- a/internal/daemon/vm_lifecycle.go +++ b/internal/daemon/vm_lifecycle.go @@ -131,44 +131,27 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v } return vm, nil } - pid := s.vmHandles(vm.ID).PID op.stage("graceful_shutdown") - // Reach into the guest over SSH to force a sync + queue a poweroff - // before falling back on FC's SendCtrlAltDel. The sync is what - // keeps stop() from losing data: every dirty page the guest hasn't - // flushed through virtio-blk to the work disk is written out - // before this RPC returns. Without it, files freshly created via - // `vm workspace prepare` can disappear across stop+start, because - // the 10-second wait_for_exit window expires (FC doesn't exit on - // SendCtrlAltDel — Debian routes ctrl-alt-del.target → reboot.target, - // not poweroff) and the fallback SIGKILL drops everything still - // in FC's userspace I/O path. + // Reach into the guest over SSH to force a sync + queue a poweroff. + // The sync is what keeps stop() from losing data: every dirty page + // the guest hasn't flushed through virtio-blk to the work disk is + // written out before this RPC returns. Once sync completes, + // root.ext4 on the host is consistent and cleanupRuntime's SIGKILL + // is safe — there is no benefit to waiting for the guest's + // poweroff.target to finish, so we skip waitForExit entirely. // - // `systemctl --no-block poweroff` is queued for the same reason - // SendCtrlAltDel was here originally — it's how stop() asks the - // guest to halt. That request is best-effort; FC may or may not - // exit before the SIGKILL fallback fires. Either way, sync - // already ran, so the on-host root.ext4 is consistent regardless. - // - // SendCtrlAltDel survives as a fallback for guests where SSH - // itself is unreachable (broken sshd, network down, drifted host - // key); it doesn't fix the data-loss path, but it's the existing - // last-resort signal and is at least no worse than today. + // When SSH is unreachable (broken sshd, network down, drifted host + // key) we drop straight to SIGKILL via cleanupRuntime. The + // previous fallback was SendCtrlAltDel + a 10-second wait for FC + // to exit, but on Debian ctrl+alt+del routes to reboot.target, so + // FC never exits on it — the wait was always a wasted 10s. We pay + // the data-loss cost we already paid before (after the timeout + // expired the old code SIGKILLed too), but without the latency. if err := s.requestGuestPoweroff(ctx, vm); err != nil { if s.logger != nil { - s.logger.Warn("guest ssh poweroff failed; falling back to ctrl+alt+del", + s.logger.Warn("guest ssh poweroff failed; SIGKILL without sync", append(vmLogAttrs(vm), "error", err.Error())...) } - if fallbackErr := s.net.sendCtrlAltDel(ctx, vm.Runtime.APISockPath); fallbackErr != nil { - return model.VMRecord{}, fallbackErr - } - } - op.stage("wait_for_exit", "pid", pid) - if err := s.net.waitForExit(ctx, pid, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil { - if !errors.Is(err, errWaitForExitTimeout) { - return model.VMRecord{}, err - } - op.stage("graceful_shutdown_timeout", "pid", pid) } op.stage("cleanup_runtime") if err := s.cleanupRuntime(ctx, vm, true); err != nil { @@ -190,16 +173,16 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v // comment in stopVMLocked. Returns the dial / SSH error if the guest // is unreachable; the caller treats that as a fallback signal. // -// Bounded by a hard 5-second SSH-dial timeout so a half-broken guest -// doesn't extend the overall stop window past the existing -// gracefulShutdownWait. If the dial doesn't succeed in that window we -// surface an error and let the caller take the SendCtrlAltDel path. +// Bounded by a hard 2-second SSH-dial timeout. A reachable guest on +// the host bridge dials in single-digit milliseconds; if we haven't +// connected in 2s the guest is effectively gone, so we fail fast and +// let the caller SIGKILL rather than burning latency on a doomed dial. func (s *VMService) requestGuestPoweroff(ctx context.Context, vm model.VMRecord) error { guestIP := strings.TrimSpace(vm.Runtime.GuestIP) if guestIP == "" { return errors.New("guest IP unknown") } - dialCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + dialCtx, cancel := context.WithTimeout(ctx, 2*time.Second) defer cancel() address := net.JoinHostPort(guestIP, "22") client, err := guest.Dial(dialCtx, address, s.config.SSHKeyPath, s.layout.KnownHostsPath) diff --git a/internal/daemon/vm_test.go b/internal/daemon/vm_test.go index 131c55f..a747104 100644 --- a/internal/daemon/vm_test.go +++ b/internal/daemon/vm_test.go @@ -1592,7 +1592,7 @@ func TestDeleteStoppedNATVMDoesNotFailWithoutTapDevice(t *testing.T) { } } -func TestStopVMFallsBackToForcedCleanupAfterGracefulTimeout(t *testing.T) { +func TestStopVMSIGKILLsWhenSSHUnreachable(t *testing.T) { ctx := context.Background() db := openDaemonStore(t) apiSock := filepath.Join(t.TempDir(), "fc.sock") @@ -1606,12 +1606,6 @@ func TestStopVMFallsBackToForcedCleanupAfterGracefulTimeout(t *testing.T) { } }) - oldGracefulWait := gracefulShutdownWait - gracefulShutdownWait = 50 * time.Millisecond - t.Cleanup(func() { - gracefulShutdownWait = oldGracefulWait - }) - vm := testVM("stubborn", "image-stubborn", "172.16.0.23") vm.State = model.VMStateRunning vm.Runtime.State = model.VMStateRunning @@ -1622,8 +1616,6 @@ func TestStopVMFallsBackToForcedCleanupAfterGracefulTimeout(t *testing.T) { scriptedRunner: &scriptedRunner{ t: t, steps: []runnerStep{ - sudoStep("", nil, "chmod", "600", apiSock), - sudoStep("", nil, "chown", "-h", fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid()), apiSock), {call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, out: []byte(strconv.Itoa(fake.Process.Pid) + "\n")}, sudoStep("", nil, "kill", "-KILL", strconv.Itoa(fake.Process.Pid)), },