daemon: fix vm start (on a stopped VM) + regression coverage

Two defects compounded to make `vm create X` → `vm stop X` → `vm start X`
→ `vm ssh X` fail with `not_running: vm X is not running` even though
`vm show` reports `state=running`.

1. firecracker-go-sdk's startVMM spawns a goroutine that SIGTERMs
   firecracker when the ctx passed to Machine.Start cancels — and
   retains that ctx for the lifetime of the VMM, not just the boot
   phase. Our Machine.Start wrapper was plumbing the caller's ctx
   through, which on `vm.start` is the RPC request ctx. daemon.go's
   handleConn cancels reqCtx via `defer cancel()` right after
   writing the response. Net effect: firecracker is killed ~150ms
   after the `vm start` RPC "completes", invisibly, and the next
   `vm ssh` sees a dead PID. `vm.create` side-stepped the bug
   because BeginVMCreate detaches to context.Background() before
   calling startVMLocked; `vm.start` used the RPC ctx directly.

   Fix: Machine.Start now passes context.Background() to the SDK.
   We own firecracker lifecycle explicitly (StopVM / KillVM /
   cleanupRuntime), so ctx-driven cancellation here was never
   actually wired into anything useful.

2. With (1) fixed, the same scenario exposed a second defect:
   patchRootOverlay's e2cp/e2rm refuses to touch the dm-snapshot
   with "Inode bitmap checksum does not match bitmap" on a restart,
   because the COW holds stale free-block/free-inode counters from
   the previous guest boot. Kernel ext4 is fine with this; e2fsprogs
   is not. Fix: run `e2fsck -fy` on the snapshot between the
   dm_snapshot and patch_root_overlay stages. Idempotent on a fresh
   snapshot, reconciles the bitmaps on a reused COW.
Regression coverage:

- scripts/repro-restart-bug.sh — minimal create→stop→start→ssh
  reproducer with rich on-failure diagnostics (daemon log trace,
  firecracker.log tail, handles.json, pgrep-by-apiSock, apiSock
  stat). Exits non-zero if the bug returns.

- scripts/smoke.sh — lifecycle scenario (create/ssh/stop/start/
  ssh/delete) and vm-set scenario (--vcpu 2 → stop → set --vcpu 4
  → start → assert nproc=4). Both were pulled when the bug was
  first found; now restored.

Supporting:

- internal/system/system.ExitCode — extracts exec.ExitError's
  code without forcing callers to import os/exec. Needed by the
  e2fsck caller (policy test pins os/exec to the shell-out
  packages).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
parent e94e7c4dcc
commit b4afe13b2a

5 changed files with 303 additions and 1 deletions
```diff
@@ -3,6 +3,7 @@ package daemon
 import (
 	"context"
 	"errors"
 	"fmt"
 	"os"
 	"path/filepath"
 	"strconv"
```
```diff
@@ -127,6 +128,26 @@ func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image
 		return model.VMRecord{}, err
 	}
 
+	// On a restart the COW already holds writes from a previous guest
+	// boot — stale free-inode / free-block counters, possibly unwritten
+	// journal updates. e2fsprogs (e2cp/e2rm, used by patchRootOverlay)
+	// refuses to touch the snapshot with "Inode bitmap checksum does
+	// not match bitmap", which bubbles up as a "start failed" even
+	// though the filesystem is kernel-valid. `e2fsck -fy` reconciles
+	// the bitmaps and is a no-op on a fresh snapshot, so running it
+	// unconditionally keeps the code path the same for first vs.
+	// subsequent starts. Exit code 1 means "errors fixed" — we treat
+	// that as success.
+	op.stage("fsck_snapshot")
+	if _, err := s.runner.RunSudo(ctx, "e2fsck", "-fy", live.DMDev); err != nil {
+		// e2fsck exit codes: 0=clean, 1=errors corrected, 2=reboot
+		// needed, 4+=uncorrected. -1 means the error wasn't an
+		// exec.ExitError (e.g. command not found, ctx cancel).
+		if code := system.ExitCode(err); code < 0 || code > 1 {
+			return cleanupOnErr(fmt.Errorf("fsck snapshot: %w", err))
+		}
+	}
+
 	op.stage("patch_root_overlay")
 	vmCreateStage(ctx, "prepare_rootfs", "writing guest configuration")
 	if err := s.patchRootOverlay(ctx, vm, image); err != nil {
```
```diff
@@ -75,10 +75,28 @@ func NewMachine(ctx context.Context, cfg MachineConfig) (*Machine, error) {
 }
 
 func (m *Machine) Start(ctx context.Context) error {
-	if err := m.machine.Start(ctx); err != nil {
+	// The caller's ctx is INTENTIONALLY not forwarded to the SDK.
+	// firecracker-go-sdk's startVMM (machine.go) spawns a goroutine
+	// that SIGTERMs firecracker the instant this ctx cancels, and
+	// retains it for the lifetime of the VMM — not just the boot
+	// phase. Plumbing an RPC request ctx through would mean
+	// firecracker dies the moment the daemon writes its RPC response
+	// (daemon.go:handleConn defers cancel). That silently breaks
+	// `vm start` on a stopped VM: start "succeeds", the handler
+	// returns, ctx cancels, firecracker is SIGTERMed, and the next
+	// `vm ssh` hits `vmAlive = false`. `vm.create` sidesteps the bug
+	// because BeginVMCreate detaches to a background ctx before
+	// calling startVMLocked.
+	//
+	// We own firecracker lifecycle explicitly — StopVM / KillVM /
+	// cleanupRuntime — so losing ctx-driven cancellation here is
+	// deliberate. The SDK still enforces its own boot-phase timeouts
+	// (socket wait, HTTP) with internal deadlines.
+	if err := m.machine.Start(context.Background()); err != nil {
 		m.closeLog()
 		return err
 	}
+	_ = ctx
 
 	go func() {
 		_ = m.machine.Wait(context.Background())
```
```diff
@@ -39,6 +39,22 @@ func NewRunner() Runner {
 	return Runner{}
 }
 
+// ExitCode extracts the process exit code from an error returned by
+// Run/RunSudo. Returns -1 when the error isn't an *exec.ExitError
+// (e.g. a context cancellation, the command wasn't found). Exposing
+// this here keeps daemon-level callers out of os/exec — the
+// shellout-policy test rejects direct imports outside system/cli/etc.
+func ExitCode(err error) int {
+	if err == nil {
+		return 0
+	}
+	var exitErr *exec.ExitError
+	if errors.As(err, &exitErr) {
+		return exitErr.ExitCode()
+	}
+	return -1
+}
+
 func (Runner) Run(ctx context.Context, name string, args ...string) ([]byte, error) {
 	cmd := exec.CommandContext(ctx, name, args...)
 	var stdout bytes.Buffer
```