daemon: cut vm stop latency

Three changes to stopVMLocked, biggest win first:

- Skip waitForExit on the SSH-success path. sync inside the guest
  already flushed root.ext4, so cleanupRuntime's SIGKILL is safe
  immediately. Saves up to gracefulShutdownWait (10s) per stop.
- Drop the SendCtrlAltDel + 10s wait fallback when SSH is
  unreachable. On Debian, ctrl+alt+del routes to reboot.target, so
  FC never exits on it — the wait was pure latency.
- Shrink the SSH dial timeout 5s → 2s. A reachable guest dials in
  single-digit milliseconds; if it doesn't, fail fast and SIGKILL.

Worst-case stop latency (with SSH broken) drops from ~15s to ~2s + cleanup time.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Thales Maciel 2026-05-03 17:51:22 -03:00
parent c352aba50a
commit 05439d2325
No known key found for this signature in database
GPG key ID: 33112E6833C34679
2 changed files with 21 additions and 46 deletions

View file

@ -131,44 +131,27 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v
} }
return vm, nil return vm, nil
} }
pid := s.vmHandles(vm.ID).PID
op.stage("graceful_shutdown") op.stage("graceful_shutdown")
// Reach into the guest over SSH to force a sync + queue a poweroff // Reach into the guest over SSH to force a sync + queue a poweroff.
// before falling back on FC's SendCtrlAltDel. The sync is what // The sync is what keeps stop() from losing data: every dirty page
// keeps stop() from losing data: every dirty page the guest hasn't // the guest hasn't flushed through virtio-blk to the work disk is
// flushed through virtio-blk to the work disk is written out // written out before this RPC returns. Once sync completes,
// before this RPC returns. Without it, files freshly created via // root.ext4 on the host is consistent and cleanupRuntime's SIGKILL
// `vm workspace prepare` can disappear across stop+start, because // is safe — there is no benefit to waiting for the guest's
// the 10-second wait_for_exit window expires (FC doesn't exit on // poweroff.target to finish, so we skip waitForExit entirely.
// SendCtrlAltDel — Debian routes ctrl-alt-del.target → reboot.target,
// not poweroff) and the fallback SIGKILL drops everything still
// in FC's userspace I/O path.
// //
// `systemctl --no-block poweroff` is queued for the same reason // When SSH is unreachable (broken sshd, network down, drifted host
// SendCtrlAltDel was here originally — it's how stop() asks the // key) we drop straight to SIGKILL via cleanupRuntime. The
// guest to halt. That request is best-effort; FC may or may not // previous fallback was SendCtrlAltDel + a 10-second wait for FC
// exit before the SIGKILL fallback fires. Either way, sync // to exit, but on Debian ctrl+alt+del routes to reboot.target, so
// already ran, so the on-host root.ext4 is consistent regardless. // FC never exits on it — the wait was always a wasted 10s. We pay
// // the data-loss cost we already paid before (after the timeout
// SendCtrlAltDel survives as a fallback for guests where SSH // expired the old code SIGKILLed too), but without the latency.
// itself is unreachable (broken sshd, network down, drifted host
// key); it doesn't fix the data-loss path, but it's the existing
// last-resort signal and is at least no worse than today.
if err := s.requestGuestPoweroff(ctx, vm); err != nil { if err := s.requestGuestPoweroff(ctx, vm); err != nil {
if s.logger != nil { if s.logger != nil {
s.logger.Warn("guest ssh poweroff failed; falling back to ctrl+alt+del", s.logger.Warn("guest ssh poweroff failed; SIGKILL without sync",
append(vmLogAttrs(vm), "error", err.Error())...) append(vmLogAttrs(vm), "error", err.Error())...)
} }
if fallbackErr := s.net.sendCtrlAltDel(ctx, vm.Runtime.APISockPath); fallbackErr != nil {
return model.VMRecord{}, fallbackErr
}
}
op.stage("wait_for_exit", "pid", pid)
if err := s.net.waitForExit(ctx, pid, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil {
if !errors.Is(err, errWaitForExitTimeout) {
return model.VMRecord{}, err
}
op.stage("graceful_shutdown_timeout", "pid", pid)
} }
op.stage("cleanup_runtime") op.stage("cleanup_runtime")
if err := s.cleanupRuntime(ctx, vm, true); err != nil { if err := s.cleanupRuntime(ctx, vm, true); err != nil {
@ -190,16 +173,16 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v
// comment in stopVMLocked. Returns the dial / SSH error if the guest // comment in stopVMLocked. Returns the dial / SSH error if the guest
// is unreachable; the caller treats that as a fallback signal. // is unreachable; the caller treats that as a fallback signal.
// //
// Bounded by a hard 5-second SSH-dial timeout so a half-broken guest // Bounded by a hard 2-second SSH-dial timeout. A reachable guest on
// doesn't extend the overall stop window past the existing // the host bridge dials in single-digit milliseconds; if we haven't
// gracefulShutdownWait. If the dial doesn't succeed in that window we // connected in 2s the guest is effectively gone, so we fail fast and
// surface an error and let the caller take the SendCtrlAltDel path. // let the caller SIGKILL rather than burning latency on a doomed dial.
func (s *VMService) requestGuestPoweroff(ctx context.Context, vm model.VMRecord) error { func (s *VMService) requestGuestPoweroff(ctx context.Context, vm model.VMRecord) error {
guestIP := strings.TrimSpace(vm.Runtime.GuestIP) guestIP := strings.TrimSpace(vm.Runtime.GuestIP)
if guestIP == "" { if guestIP == "" {
return errors.New("guest IP unknown") return errors.New("guest IP unknown")
} }
dialCtx, cancel := context.WithTimeout(ctx, 5*time.Second) dialCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
defer cancel() defer cancel()
address := net.JoinHostPort(guestIP, "22") address := net.JoinHostPort(guestIP, "22")
client, err := guest.Dial(dialCtx, address, s.config.SSHKeyPath, s.layout.KnownHostsPath) client, err := guest.Dial(dialCtx, address, s.config.SSHKeyPath, s.layout.KnownHostsPath)

View file

@ -1592,7 +1592,7 @@ func TestDeleteStoppedNATVMDoesNotFailWithoutTapDevice(t *testing.T) {
} }
} }
func TestStopVMFallsBackToForcedCleanupAfterGracefulTimeout(t *testing.T) { func TestStopVMSIGKILLsWhenSSHUnreachable(t *testing.T) {
ctx := context.Background() ctx := context.Background()
db := openDaemonStore(t) db := openDaemonStore(t)
apiSock := filepath.Join(t.TempDir(), "fc.sock") apiSock := filepath.Join(t.TempDir(), "fc.sock")
@ -1606,12 +1606,6 @@ func TestStopVMFallsBackToForcedCleanupAfterGracefulTimeout(t *testing.T) {
} }
}) })
oldGracefulWait := gracefulShutdownWait
gracefulShutdownWait = 50 * time.Millisecond
t.Cleanup(func() {
gracefulShutdownWait = oldGracefulWait
})
vm := testVM("stubborn", "image-stubborn", "172.16.0.23") vm := testVM("stubborn", "image-stubborn", "172.16.0.23")
vm.State = model.VMStateRunning vm.State = model.VMStateRunning
vm.Runtime.State = model.VMStateRunning vm.Runtime.State = model.VMStateRunning
@ -1622,8 +1616,6 @@ func TestStopVMFallsBackToForcedCleanupAfterGracefulTimeout(t *testing.T) {
scriptedRunner: &scriptedRunner{ scriptedRunner: &scriptedRunner{
t: t, t: t,
steps: []runnerStep{ steps: []runnerStep{
sudoStep("", nil, "chmod", "600", apiSock),
sudoStep("", nil, "chown", "-h", fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid()), apiSock),
{call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, out: []byte(strconv.Itoa(fake.Process.Pid) + "\n")}, {call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, out: []byte(strconv.Itoa(fake.Process.Pid) + "\n")},
sudoStep("", nil, "kill", "-KILL", strconv.Itoa(fake.Process.Pid)), sudoStep("", nil, "kill", "-KILL", strconv.Itoa(fake.Process.Pid)),
}, },