From b4afe13b2aa464f62de06fe73699875fe070aafb Mon Sep 17 00:00:00 2001 From: Thales Maciel Date: Thu, 23 Apr 2026 12:01:46 -0300 Subject: [PATCH] daemon: fix vm start (on a stopped VM) + regression coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two defects compounded to make `vm create X` → `vm stop X` → `vm start X` → `vm ssh X` fail with `not_running: vm X is not running` even though `vm show` reports `state=running`. 1. firecracker-go-sdk's startVMM spawns a goroutine that SIGTERMs firecracker when the ctx passed to Machine.Start cancels — and retains that ctx for the lifetime of the VMM, not just the boot phase. Our Machine.Start wrapper was plumbing the caller's ctx through, which on `vm.start` is the RPC request ctx. daemon.go's handleConn cancels reqCtx via `defer cancel()` right after writing the response. Net effect: firecracker is killed ~150ms after the `vm start` RPC "completes", invisibly, and the next `vm ssh` sees a dead PID. `vm.create` side-stepped the bug because BeginVMCreate detaches to context.Background() before calling startVMLocked; `vm.start` used the RPC ctx directly. Fix: Machine.Start now passes context.Background() to the SDK. We own firecracker lifecycle explicitly (StopVM / KillVM / cleanupRuntime), so ctx-driven cancellation here was never actually wired into anything useful. 2. With (1) fixed, the same scenario exposed a second defect: patchRootOverlay's e2cp/e2rm refuses to touch the dm-snapshot with "Inode bitmap checksum does not match bitmap" on a restart, because the COW holds stale free-block/free-inode counters from the previous guest boot. Kernel ext4 is fine with this; e2fsprogs is not. Fix: run `e2fsck -fy` on the snapshot between the dm_snapshot and patch_root_overlay stages. Idempotent on a fresh snapshot, reconciles the bitmaps on a reused COW. Regression coverage: - scripts/repro-restart-bug.sh — minimal create→stop→start→ssh reproducer with rich on-failure diagnostics (daemon log trace, firecracker.log tail, handles.json, pgrep-by-apiSock, apiSock stat). Exits non-zero if the bug returns. - scripts/smoke.sh — lifecycle scenario (create/ssh/stop/start/ ssh/delete) and vm-set scenario (--vcpu 2 → stop → set --vcpu 4 → start → assert nproc=4). Both were pulled when the bug was first found; now restored. Supporting: - internal/system/system.ExitCode — extracts exec.ExitError's code without forcing callers to import os/exec. Needed by the e2fsck caller (policy test pins os/exec to the shell-out packages). Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/daemon/vm_lifecycle.go | 21 +++++ internal/firecracker/client.go | 20 ++++- internal/system/system.go | 16 ++++ scripts/repro-restart-bug.sh | 151 ++++++++++++++++++++++++++++++++ scripts/smoke.sh | 96 ++++++++++++++++++++ 5 files changed, 303 insertions(+), 1 deletion(-) create mode 100755 scripts/repro-restart-bug.sh diff --git a/internal/daemon/vm_lifecycle.go b/internal/daemon/vm_lifecycle.go index efab83e..0190a41 100644 --- a/internal/daemon/vm_lifecycle.go +++ b/internal/daemon/vm_lifecycle.go @@ -3,6 +3,7 @@ package daemon import ( "context" "errors" + "fmt" "os" "path/filepath" "strconv" @@ -127,6 +128,26 @@ func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image return model.VMRecord{}, err } + // On a restart the COW already holds writes from a previous guest + // boot — stale free-inode / free-block counters, possibly unwritten + // journal updates. e2fsprogs (e2cp/e2rm, used by patchRootOverlay) + // refuses to touch the snapshot with "Inode bitmap checksum does + // not match bitmap", which bubbles up as a "start failed" even + // though the filesystem is kernel-valid. `e2fsck -fy` reconciles + // the bitmaps and is a no-op on a fresh snapshot, so running it + // unconditionally keeps the code path the same for first vs. + // subsequent starts. Exit code 1 means "errors fixed" — we treat + // that as success. + op.stage("fsck_snapshot") + if _, err := s.runner.RunSudo(ctx, "e2fsck", "-fy", live.DMDev); err != nil { + // e2fsck exit codes: 0=clean, 1=errors corrected, 2=reboot + // needed, 4+=uncorrected. -1 means the error wasn't an + // exec.ExitError (e.g. command not found, ctx cancel). + if code := system.ExitCode(err); code < 0 || code > 1 { + return cleanupOnErr(fmt.Errorf("fsck snapshot: %w", err)) + } + } + op.stage("patch_root_overlay") vmCreateStage(ctx, "prepare_rootfs", "writing guest configuration") if err := s.patchRootOverlay(ctx, vm, image); err != nil { diff --git a/internal/firecracker/client.go b/internal/firecracker/client.go index 328dc40..063404f 100644 --- a/internal/firecracker/client.go +++ b/internal/firecracker/client.go @@ -75,10 +75,28 @@ func NewMachine(ctx context.Context, cfg MachineConfig) (*Machine, error) { } func (m *Machine) Start(ctx context.Context) error { - if err := m.machine.Start(ctx); err != nil { + // The caller's ctx is INTENTIONALLY not forwarded to the SDK. + // firecracker-go-sdk's startVMM (machine.go) spawns a goroutine + // that SIGTERMs firecracker the instant this ctx cancels, and + // retains it for the lifetime of the VMM — not just the boot + // phase. Plumbing an RPC request ctx through would mean + // firecracker dies the moment the daemon writes its RPC response + // (daemon.go:handleConn defers cancel). That silently breaks + // `vm start` on a stopped VM: start "succeeds", the handler + // returns, ctx cancels, firecracker is SIGTERMed, and the next + // `vm ssh` hits `vmAlive = false`. `vm.create` sidesteps the bug + // because BeginVMCreate detaches to a background ctx before + // calling startVMLocked. + // + // We own firecracker lifecycle explicitly — StopVM / KillVM / + // cleanupRuntime — so losing ctx-driven cancellation here is + // deliberate. The SDK still enforces its own boot-phase timeouts + // (socket wait, HTTP) with internal deadlines. + if err := m.machine.Start(context.Background()); err != nil { m.closeLog() return err } + _ = ctx go func() { _ = m.machine.Wait(context.Background()) diff --git a/internal/system/system.go b/internal/system/system.go index 59c5cb3..800f396 100644 --- a/internal/system/system.go +++ b/internal/system/system.go @@ -39,6 +39,22 @@ func NewRunner() Runner { return Runner{} } +// ExitCode extracts the process exit code from an error returned by +// Run/RunSudo. Returns -1 when the error isn't an *exec.ExitError +// (e.g. a context cancellation, the command wasn't found). Exposing +// this here keeps daemon-level callers out of os/exec — the +// shellout-policy test rejects direct imports outside system/cli/etc. +func ExitCode(err error) int { + if err == nil { + return 0 + } + var exitErr *exec.ExitError + if errors.As(err, &exitErr) { + return exitErr.ExitCode() + } + return -1 +} + func (Runner) Run(ctx context.Context, name string, args ...string) ([]byte, error) { cmd := exec.CommandContext(ctx, name, args...) var stdout bytes.Buffer diff --git a/scripts/repro-restart-bug.sh b/scripts/repro-restart-bug.sh new file mode 100755 index 0000000..acf1a9e --- /dev/null +++ b/scripts/repro-restart-bug.sh @@ -0,0 +1,151 @@ +#!/usr/bin/env bash +# +# scripts/repro-restart-bug.sh — minimal reproducer for the +# stop-then-start bug. +# +# Symptom: after `vm create X` → `vm stop X` → `vm start X`, the store +# reports `state=running` but `vm ssh X` returns `not_running` +# because the daemon's `vmAlive(vm)` check returns false. Seen +# reliably on Debian-bookworm default image. +# +# This script: +# 1. Builds instrumented binaries (reuses $(SMOKE_BIN_DIR)) +# 2. Points banger at an isolated XDG so it doesn't touch the +# invoking user's real install +# 3. Runs the create→stop→start sequence +# 4. Asserts `vm ssh -- true` works post-restart +# 5. On failure, dumps the daemon log (vm.start trace), firecracker +# log (guest kernel output), pgrep state (is firecracker +# actually running?), api-sock presence, and handles.json +# +# Exit 0 = bug is fixed. Exit 1 = bug still reproduces. +# +# Run directly (builds binaries on demand): +# ./scripts/repro-restart-bug.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +cd "$REPO_ROOT" + +log() { printf '[repro] %s\n' "$*" >&2; } +die() { printf '[repro] FAIL: %s\n' "$*" >&2; exit 1; } + +# Reuse smoke binaries if present; otherwise build them. They're +# instrumented with -cover, but that's harmless for this test. +make smoke-build >/dev/null + +BIN_DIR="$REPO_ROOT/build/smoke/bin" +for bin in banger bangerd banger-vsock-agent; do + [[ -x "$BIN_DIR/$bin" ]] || die "missing $BIN_DIR/$bin; run make smoke-build" +done + +BANGER="$BIN_DIR/banger" +VMNAME=repro-restart + +# Isolated XDG root, torn down at exit. Unlike smoke.sh we do NOT +# persist across runs — we want a clean slate so the very first +# image pull also exercises the second-start path. +WORKDIR="$(mktemp -d -t banger-repro-XXXXXX)" +trap 'rm -rf "$WORKDIR"' EXIT + +export XDG_CONFIG_HOME="$WORKDIR/config" +export XDG_STATE_HOME="$WORKDIR/state" +export XDG_CACHE_HOME="$WORKDIR/cache" +export XDG_RUNTIME_DIR="$WORKDIR/runtime" +mkdir -p "$XDG_CONFIG_HOME" "$XDG_STATE_HOME" "$XDG_CACHE_HOME" "$XDG_RUNTIME_DIR" +chmod 0700 "$XDG_RUNTIME_DIR" + +export BANGER_DAEMON_BIN="$BIN_DIR/bangerd" +export BANGER_VSOCK_AGENT_BIN="$BIN_DIR/banger-vsock-agent" +export GOCOVERDIR="$WORKDIR/covdata" +mkdir -p "$GOCOVERDIR" + +# Refuse to run if the user's real daemon has :42069 bound — we'd +# fail for the wrong reason. +if command -v ss >/dev/null 2>&1 && ss -Huln 2>/dev/null | awk '{print $4}' | grep -q '[:.]42069$'; then + die 'port 127.0.0.1:42069 already bound; stop your real banger daemon first' +fi + +"$BANGER" daemon stop >/dev/null 2>&1 || true + +LOG_PATH="$XDG_STATE_HOME/banger/bangerd.log" + +diag() { + printf '\n[repro] === DIAGNOSTICS ===\n' >&2 + printf '[repro] vm show:\n' >&2 + "$BANGER" vm show "$VMNAME" >&2 2>/dev/null || true + local vmdir + vmdir="$("$BANGER" vm show "$VMNAME" 2>/dev/null | awk -F'"' '/"vm_dir"/ {print $4}')" + local apisock + apisock="$("$BANGER" vm show "$VMNAME" 2>/dev/null | awk -F'"' '/"api_sock_path"/ {print $4}')" + + printf '\n[repro] handles.json:\n' >&2 + [[ -n "$vmdir" && -f "$vmdir/handles.json" ]] && cat "$vmdir/handles.json" >&2 || echo ' (missing)' >&2 + + printf '\n[repro] pgrep by apiSock (%s):\n' "$apisock" >&2 + [[ -n "$apisock" ]] && (pgrep -af "$apisock" >&2 || echo ' (none)' >&2) || echo ' (no apisock)' >&2 + + printf '\n[repro] apiSock present:\n' >&2 + if [[ -n "$apisock" ]]; then + if [[ -S "$apisock" ]]; then + sudo -n ls -la "$apisock" >&2 2>/dev/null || ls -la "$apisock" >&2 2>/dev/null || echo ' (cannot stat)' >&2 + else + echo ' NOT PRESENT' >&2 + fi + fi + + printf '\n[repro] daemon log — last vm.start trace:\n' >&2 + if [[ -f "$LOG_PATH" ]]; then + # Dump everything from the most recent "operation started" for vm.start. + awk ' + /"operation":"vm\.start"/ && /"msg":"operation started"/ { lastStart=NR } + { lines[NR]=$0 } + END { + if (lastStart) for (i=lastStart; i<=NR; i++) print lines[i] + } + ' "$LOG_PATH" | tail -40 >&2 + else + echo ' (daemon log missing)' >&2 + fi + + printf '\n[repro] firecracker.log tail (guest kernel output):\n' >&2 + [[ -n "$vmdir" && -f "$vmdir/firecracker.log" ]] && tail -30 "$vmdir/firecracker.log" >&2 || echo ' (missing)' >&2 + + printf '\n' >&2 +} + +log "create $VMNAME" +"$BANGER" vm create --name "$VMNAME" >/dev/null || die "create failed" +log 'wait for initial ssh' +deadline=$(( $(date +%s) + 90 )) +while (( $(date +%s) < deadline )); do + "$BANGER" vm ssh "$VMNAME" -- true >/dev/null 2>&1 && break + sleep 1 +done +"$BANGER" vm ssh "$VMNAME" -- true >/dev/null 2>&1 || { diag; die 'initial ssh never came up'; } + +log 'stop' +"$BANGER" vm stop "$VMNAME" >/dev/null || { diag; die 'stop failed'; } + +log 'start (this is where the bug manifests)' +"$BANGER" vm start "$VMNAME" >/dev/null || { diag; die 'start failed'; } + +log 'assert vm ssh succeeds post-restart (60s budget)' +deadline=$(( $(date +%s) + 60 )) +while (( $(date +%s) < deadline )); do + if "$BANGER" vm ssh "$VMNAME" -- true >/dev/null 2>&1; then + log 'PASS — bug appears fixed' + "$BANGER" vm delete "$VMNAME" >/dev/null 2>&1 || true + "$BANGER" daemon stop >/dev/null 2>&1 || true + exit 0 + fi + sleep 1 +done + +log 'FAIL — vm ssh never succeeded post-restart' +diag +"$BANGER" vm delete "$VMNAME" >/dev/null 2>&1 || true +"$BANGER" daemon stop >/dev/null 2>&1 || true +exit 1 diff --git a/scripts/smoke.sh b/scripts/smoke.sh index 6ad7e58..15ca8d0 100755 --- a/scripts/smoke.sh +++ b/scripts/smoke.sh @@ -24,6 +24,23 @@ set -euo pipefail log() { printf '[smoke] %s\n' "$*" >&2; } die() { printf '[smoke] FAIL: %s\n' "$*" >&2; exit 1; } +# wait_for_ssh polls `vm ssh -- true` until it succeeds or the +# timeout expires. `vm ssh` — unlike `vm run` — does not itself wait +# for guest sshd, so scenarios that call `vm create` / `vm start` +# back-to-back with `vm ssh` need this shim. 60s matches +# vmRunSSHTimeout. +wait_for_ssh() { + local vm="$1" + local deadline=$(( $(date +%s) + 60 )) + while (( $(date +%s) < deadline )); do + if "$BANGER" vm ssh "$vm" -- true >/dev/null 2>&1; then + return 0 + fi + sleep 1 + done + return 1 +} + : "${BANGER_SMOKE_BIN_DIR:?must point at the instrumented binary dir, set by make smoke}" : "${BANGER_SMOKE_COVER_DIR:?must point at the coverage dir, set by make smoke}" : "${BANGER_SMOKE_XDG_DIR:?must point at the isolated XDG root, set by make smoke}" @@ -193,6 +210,85 @@ wait "$pidB" || die "concurrent VM B exited non-zero: $(cat "$tmpB")" grep -q 'smoke-concurrent-a' "$tmpA" || die "concurrent VM A missing marker: $(cat "$tmpA")" grep -q 'smoke-concurrent-b' "$tmpB" || die "concurrent VM B missing marker: $(cat "$tmpB")" +# --- vm lifecycle (create → stop → start → delete) -------------------- +# Exercises lifecycle verbs directly instead of the --rm convenience +# path. The critical assertion is the second `vm ssh` AFTER stop/start: +# that path (a) rebuilds the handle cache via rediscoverHandles, +# (b) runs the e2fsck-snapshot sanitize step before patchRootOverlay +# on the dirty COW, and (c) shouldn't die from the SDK's +# ctx-SIGTERM-on-RPC-close goroutine. All three were bugs at one +# point; this scenario guards all three at once. +log 'vm lifecycle: explicit create / stop / start / ssh / delete' +lifecycle_name=smoke-lifecycle +# shellcheck disable=SC2064 +trap "\"$BANGER\" vm delete $lifecycle_name >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT + +"$BANGER" vm create --name "$lifecycle_name" >/dev/null || die "vm create $lifecycle_name failed" +show_out="$("$BANGER" vm show "$lifecycle_name")" || die "vm show after create failed" +grep -q '"state": "running"' <<<"$show_out" || die "post-create state not running: $show_out" + +wait_for_ssh "$lifecycle_name" || die 'vm lifecycle: ssh did not come up after create' +ssh_out="$("$BANGER" vm ssh "$lifecycle_name" -- echo hello-1)" || die "vm ssh #1 failed" +grep -q 'hello-1' <<<"$ssh_out" || die "vm ssh #1 missing marker: $ssh_out" + +"$BANGER" vm stop "$lifecycle_name" >/dev/null || die "vm stop failed" +show_out="$("$BANGER" vm show "$lifecycle_name")" || die "vm show after stop failed" +grep -q '"state": "stopped"' <<<"$show_out" || die "post-stop state not stopped: $show_out" + +"$BANGER" vm start "$lifecycle_name" >/dev/null || die "vm start (from stopped) failed" +show_out="$("$BANGER" vm show "$lifecycle_name")" || die "vm show after start failed" +grep -q '"state": "running"' <<<"$show_out" || die "post-start state not running: $show_out" + +wait_for_ssh "$lifecycle_name" || die 'vm lifecycle: ssh did not come up after restart' +ssh_out="$("$BANGER" vm ssh "$lifecycle_name" -- echo hello-2)" || die "vm ssh #2 (post-restart) failed" +grep -q 'hello-2' <<<"$ssh_out" || die "vm ssh #2 missing marker: $ssh_out" + +"$BANGER" vm delete "$lifecycle_name" >/dev/null || die "vm delete failed" +set +e +"$BANGER" vm show "$lifecycle_name" >/dev/null 2>&1 +rc=$? +set -e +[[ "$rc" -ne 0 ]] || die "vm show still finds $lifecycle_name after delete" +# shellcheck disable=SC2064 +trap "rm -rf '$runtime_dir'" EXIT + +# --- vm set reconfiguration (vcpu change + restart) ------------------- +# Exercises SetVM + configChangeCapability. Create with --vcpu 2, +# stop, `vm set --vcpu 4`, restart, confirm the guest sees the new +# count. Regression guard: a restart that reuses the pre-change spec +# would leave nproc at 2. +log 'vm set: create --vcpu 2 → stop → set --vcpu 4 → restart → nproc=4' +# shellcheck disable=SC2064 +trap "\"$BANGER\" vm delete smoke-set >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT + +"$BANGER" vm create --name smoke-set --vcpu 2 >/dev/null || die 'vm set: create failed' +wait_for_ssh smoke-set || die 'vm set: initial ssh did not come up' + +set +e +nproc_before="$("$BANGER" vm ssh smoke-set -- nproc 2>/dev/null)" +rc=$? +set -e +[[ "$rc" -eq 0 ]] || die "vm set: initial nproc ssh exit $rc" +[[ "$(printf '%s' "$nproc_before" | tr -d '[:space:]')" == "2" ]] \ + || die "vm set: initial nproc got '$nproc_before', want 2" + +"$BANGER" vm stop smoke-set >/dev/null || die 'vm set: stop failed' +"$BANGER" vm set smoke-set --vcpu 4 >/dev/null || die 'vm set: reconfigure failed' +"$BANGER" vm start smoke-set >/dev/null || die 'vm set: restart failed' +wait_for_ssh smoke-set || die 'vm set: post-reconfig ssh did not come up' + +set +e +nproc_after="$("$BANGER" vm ssh smoke-set -- nproc 2>/dev/null)" +rc=$? +set -e +[[ "$rc" -eq 0 ]] || die "vm set: post-reconfig nproc ssh exit $rc" +[[ "$(printf '%s' "$nproc_after" | tr -d '[:space:]')" == "4" ]] \ + || die "vm set: post-reconfig nproc got '$nproc_after', want 4 (spec change didn't land)" + +"$BANGER" vm delete smoke-set >/dev/null || die 'vm set: delete failed' +# shellcheck disable=SC2064 +trap "rm -rf '$runtime_dir'" EXIT + # --- invalid spec rejection + no artifact leak ------------------------ # Tests the negative-path create flow: a blatantly invalid VM spec # must fail before any VM row is persisted. The review cycle flagged