daemon: fix vm start (on a stopped VM) + regression coverage

Two defects compounded to make `vm create X` → `vm stop X` → `vm start X`
→ `vm ssh X` fail with `not_running: vm X is not running` even though
`vm show` reports `state=running`.

1. firecracker-go-sdk's startVMM spawns a goroutine that SIGTERMs
   firecracker when the ctx passed to Machine.Start cancels — and
   retains that ctx for the lifetime of the VMM, not just the boot
   phase. Our Machine.Start wrapper was plumbing the caller's ctx
   through, which on `vm.start` is the RPC request ctx. daemon.go's
   handleConn cancels reqCtx via `defer cancel()` right after
   writing the response. Net effect: firecracker is killed ~150ms
   after the `vm start` RPC "completes", invisibly, and the next
   `vm ssh` sees a dead PID. `vm.create` side-stepped the bug
   because BeginVMCreate detaches to context.Background() before
   calling startVMLocked; `vm.start` used the RPC ctx directly.
   Fix: Machine.Start now passes context.Background() to the SDK.
   We own firecracker lifecycle explicitly (StopVM / KillVM /
   cleanupRuntime), so ctx-driven cancellation here was never
   actually wired into anything useful.

2. With (1) fixed, the same scenario exposed a second defect:
   patchRootOverlay's e2cp/e2rm refuses to touch the dm-snapshot
   with "Inode bitmap checksum does not match bitmap" on a restart,
   because the COW holds stale free-block/free-inode counters from
   the previous guest boot. Kernel ext4 is fine with this; e2fsprogs
   is not. Fix: run `e2fsck -fy` on the snapshot between the
   dm_snapshot and patch_root_overlay stages. Idempotent on a fresh
   snapshot, reconciles the bitmaps on a reused COW.

Regression coverage:
  - scripts/repro-restart-bug.sh — minimal create→stop→start→ssh
    reproducer with rich on-failure diagnostics (daemon log trace,
    firecracker.log tail, handles.json, pgrep-by-apiSock, apiSock
    stat). Exits non-zero if the bug returns.
  - scripts/smoke.sh — lifecycle scenario (create/ssh/stop/start/
    ssh/delete) and vm-set scenario (--vcpu 2 → stop → set --vcpu 4
    → start → assert nproc=4). Both were pulled when the bug was
    first found; now restored.

Supporting:
  - internal/system/system.ExitCode — extracts exec.ExitError's
    code without forcing callers to import os/exec. Needed by the
    e2fsck caller (policy test pins os/exec to the shell-out
    packages).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Thales Maciel 2026-04-23 12:01:46 -03:00
parent e94e7c4dcc
commit b4afe13b2a
No known key found for this signature in database
GPG key ID: 33112E6833C34679
5 changed files with 303 additions and 1 deletions

View file

@ -3,6 +3,7 @@ package daemon
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"strconv"
@ -127,6 +128,26 @@ func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image
return model.VMRecord{}, err
}
// On a restart the COW already holds writes from a previous guest
// boot — stale free-inode / free-block counters, possibly unwritten
// journal updates. e2fsprogs (e2cp/e2rm, used by patchRootOverlay)
// refuses to touch the snapshot with "Inode bitmap checksum does
// not match bitmap", which bubbles up as a "start failed" even
// though the filesystem is kernel-valid. `e2fsck -fy` reconciles
// the bitmaps and is a no-op on a fresh snapshot, so running it
// unconditionally keeps the code path the same for first vs.
// subsequent starts. Exit code 1 means "errors fixed" — we treat
// that as success.
op.stage("fsck_snapshot")
if _, err := s.runner.RunSudo(ctx, "e2fsck", "-fy", live.DMDev); err != nil {
// e2fsck exit codes: 0=clean, 1=errors corrected, 2=reboot
// needed, 4+=uncorrected. -1 means the error wasn't an
// exec.ExitError (e.g. command not found, ctx cancel).
if code := system.ExitCode(err); code < 0 || code > 1 {
return cleanupOnErr(fmt.Errorf("fsck snapshot: %w", err))
}
}
op.stage("patch_root_overlay")
vmCreateStage(ctx, "prepare_rootfs", "writing guest configuration")
if err := s.patchRootOverlay(ctx, vm, image); err != nil {

View file

@ -75,10 +75,28 @@ func NewMachine(ctx context.Context, cfg MachineConfig) (*Machine, error) {
}
func (m *Machine) Start(ctx context.Context) error {
if err := m.machine.Start(ctx); err != nil {
// The caller's ctx is INTENTIONALLY not forwarded to the SDK.
// firecracker-go-sdk's startVMM (machine.go) spawns a goroutine
// that SIGTERMs firecracker the instant this ctx cancels, and
// retains it for the lifetime of the VMM — not just the boot
// phase. Plumbing an RPC request ctx through would mean
// firecracker dies the moment the daemon writes its RPC response
// (daemon.go:handleConn defers cancel). That silently breaks
// `vm start` on a stopped VM: start "succeeds", the handler
// returns, ctx cancels, firecracker is SIGTERMed, and the next
// `vm ssh` hits `vmAlive = false`. `vm.create` sidesteps the bug
// because BeginVMCreate detaches to a background ctx before
// calling startVMLocked.
//
// We own firecracker lifecycle explicitly — StopVM / KillVM /
// cleanupRuntime — so losing ctx-driven cancellation here is
// deliberate. The SDK still enforces its own boot-phase timeouts
// (socket wait, HTTP) with internal deadlines.
if err := m.machine.Start(context.Background()); err != nil {
m.closeLog()
return err
}
_ = ctx
go func() {
_ = m.machine.Wait(context.Background())

View file

@ -39,6 +39,22 @@ func NewRunner() Runner {
return Runner{}
}
// ExitCode extracts the process exit code from an error returned by
// Run/RunSudo. Returns -1 when the error isn't an *exec.ExitError
// (e.g. a context cancellation, the command wasn't found). Exposing
// this here keeps daemon-level callers out of os/exec — the
// shellout-policy test rejects direct imports outside system/cli/etc.
func ExitCode(err error) int {
if err == nil {
return 0
}
var exitErr *exec.ExitError
if errors.As(err, &exitErr) {
return exitErr.ExitCode()
}
return -1
}
func (Runner) Run(ctx context.Context, name string, args ...string) ([]byte, error) {
cmd := exec.CommandContext(ctx, name, args...)
var stdout bytes.Buffer

151
scripts/repro-restart-bug.sh Executable file
View file

@ -0,0 +1,151 @@
#!/usr/bin/env bash
#
# scripts/repro-restart-bug.sh — minimal reproducer for the
# stop-then-start bug.
#
# Symptom: after `vm create X` → `vm stop X` → `vm start X`, the store
# reports `state=running` but `vm ssh X` returns `not_running`
# because the daemon's `vmAlive(vm)` check returns false. Seen
# reliably on Debian-bookworm default image.
#
# This script:
# 1. Builds instrumented binaries (reuses $(SMOKE_BIN_DIR))
# 2. Points banger at an isolated XDG so it doesn't touch the
# invoking user's real install
# 3. Runs the create→stop→start sequence
# 4. Asserts `vm ssh -- true` works post-restart
# 5. On failure, dumps the daemon log (vm.start trace), firecracker
# log (guest kernel output), pgrep state (is firecracker
# actually running?), api-sock presence, and handles.json
#
# Exit 0 = bug is fixed. Exit 1 = bug still reproduces.
#
# Run directly (builds binaries on demand):
# ./scripts/repro-restart-bug.sh
# Abort on any command failure, unset variable, or pipeline failure.
set -euo pipefail
# Resolve the repo root relative to this script so it can be invoked
# from any working directory; everything below assumes repo-root cwd.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
cd "$REPO_ROOT"
# log emits a progress line on stderr, prefixed for grep-ability.
log() {
printf '%s\n' "[repro] $*" >&2
}
# die reports a fatal failure on stderr and aborts the script.
die() {
printf '%s\n' "[repro] FAIL: $*" >&2
exit 1
}
# Reuse smoke binaries if present; otherwise build them. They're
# instrumented with -cover, but that's harmless for this test.
make smoke-build >/dev/null
BIN_DIR="$REPO_ROOT/build/smoke/bin"
# Sanity-check all three binaries exist and are executable before
# spending any time on VM work.
for bin in banger bangerd banger-vsock-agent; do
[[ -x "$BIN_DIR/$bin" ]] || die "missing $BIN_DIR/$bin; run make smoke-build"
done
BANGER="$BIN_DIR/banger"
VMNAME=repro-restart
# Isolated XDG root, torn down at exit. Unlike smoke.sh we do NOT
# persist across runs — we want a clean slate so the very first
# image pull also exercises the second-start path.
WORKDIR="$(mktemp -d -t banger-repro-XXXXXX)"
trap 'rm -rf "$WORKDIR"' EXIT
export XDG_CONFIG_HOME="$WORKDIR/config"
export XDG_STATE_HOME="$WORKDIR/state"
export XDG_CACHE_HOME="$WORKDIR/cache"
export XDG_RUNTIME_DIR="$WORKDIR/runtime"
mkdir -p "$XDG_CONFIG_HOME" "$XDG_STATE_HOME" "$XDG_CACHE_HOME" "$XDG_RUNTIME_DIR"
# The XDG spec requires the runtime dir to be user-private (0700).
chmod 0700 "$XDG_RUNTIME_DIR"
# Point the CLI at the instrumented daemon/agent builds.
export BANGER_DAEMON_BIN="$BIN_DIR/bangerd"
export BANGER_VSOCK_AGENT_BIN="$BIN_DIR/banger-vsock-agent"
# Collect Go coverage data emitted by the -cover binaries.
export GOCOVERDIR="$WORKDIR/covdata"
mkdir -p "$GOCOVERDIR"
# Refuse to run if the user's real daemon has :42069 bound — we'd
# fail for the wrong reason.
if command -v ss >/dev/null 2>&1 && ss -Huln 2>/dev/null | awk '{print $4}' | grep -q '[:.]42069$'; then
die 'port 127.0.0.1:42069 already bound; stop your real banger daemon first'
fi
# Best-effort stop of any daemon left over from a previous aborted run.
"$BANGER" daemon stop >/dev/null 2>&1 || true
LOG_PATH="$XDG_STATE_HOME/banger/bangerd.log"
# diag dumps everything useful for post-mortem when the reproducer
# fails: vm show output, handles.json, the firecracker process state
# keyed by its api socket, the daemon's vm.start log trace, and the
# firecracker log tail. All output goes to stderr; every probe is
# best-effort so a missing artifact never aborts the dump.
diag() {
printf '\n[repro] === DIAGNOSTICS ===\n' >&2
printf '[repro] vm show:\n' >&2
# Redirection order matters: stdout is folded into the current stderr
# first, THEN the command's own stderr is discarded.
"$BANGER" vm show "$VMNAME" >&2 2>/dev/null || true
local vmdir
# Extract vm_dir / api_sock_path from the JSON with awk on the quoted
# fields — avoids a hard dependency on jq.
vmdir="$("$BANGER" vm show "$VMNAME" 2>/dev/null | awk -F'"' '/"vm_dir"/ {print $4}')"
local apisock
apisock="$("$BANGER" vm show "$VMNAME" 2>/dev/null | awk -F'"' '/"api_sock_path"/ {print $4}')"
printf '\n[repro] handles.json:\n' >&2
[[ -n "$vmdir" && -f "$vmdir/handles.json" ]] && cat "$vmdir/handles.json" >&2 || echo ' (missing)' >&2
printf '\n[repro] pgrep by apiSock (%s):\n' "$apisock" >&2
[[ -n "$apisock" ]] && (pgrep -af "$apisock" >&2 || echo ' (none)' >&2) || echo ' (no apisock)' >&2
printf '\n[repro] apiSock present:\n' >&2
if [[ -n "$apisock" ]]; then
if [[ -S "$apisock" ]]; then
# The socket may be root-owned; try non-interactive sudo first,
# fall back to a plain ls, then give up quietly.
sudo -n ls -la "$apisock" >&2 2>/dev/null || ls -la "$apisock" >&2 2>/dev/null || echo ' (cannot stat)' >&2
else
echo ' NOT PRESENT' >&2
fi
fi
printf '\n[repro] daemon log — last vm.start trace:\n' >&2
if [[ -f "$LOG_PATH" ]]; then
# Dump everything from the most recent "operation started" for vm.start.
awk '
/"operation":"vm\.start"/ && /"msg":"operation started"/ { lastStart=NR }
{ lines[NR]=$0 }
END {
if (lastStart) for (i=lastStart; i<=NR; i++) print lines[i]
}
' "$LOG_PATH" | tail -40 >&2
else
echo ' (daemon log missing)' >&2
fi
printf '\n[repro] firecracker.log tail (guest kernel output):\n' >&2
[[ -n "$vmdir" && -f "$vmdir/firecracker.log" ]] && tail -30 "$vmdir/firecracker.log" >&2 || echo ' (missing)' >&2
printf '\n' >&2
}
log "create $VMNAME"
"$BANGER" vm create --name "$VMNAME" >/dev/null || die "create failed"
log 'wait for initial ssh'
# First boot may include an image pull on the clean XDG root; give it
# a generous 90s budget.
deadline=$(( $(date +%s) + 90 ))
while (( $(date +%s) < deadline )); do
"$BANGER" vm ssh "$VMNAME" -- true >/dev/null 2>&1 && break
sleep 1
done
# One final attempt after the loop so the failure path is unambiguous.
"$BANGER" vm ssh "$VMNAME" -- true >/dev/null 2>&1 || { diag; die 'initial ssh never came up'; }
log 'stop'
"$BANGER" vm stop "$VMNAME" >/dev/null || { diag; die 'stop failed'; }
log 'start (this is where the bug manifests)'
"$BANGER" vm start "$VMNAME" >/dev/null || { diag; die 'start failed'; }
log 'assert vm ssh succeeds post-restart (60s budget)'
deadline=$(( $(date +%s) + 60 ))
while (( $(date +%s) < deadline )); do
if "$BANGER" vm ssh "$VMNAME" -- true >/dev/null 2>&1; then
log 'PASS — bug appears fixed'
# Best-effort teardown; the EXIT trap removes $WORKDIR itself.
"$BANGER" vm delete "$VMNAME" >/dev/null 2>&1 || true
"$BANGER" daemon stop >/dev/null 2>&1 || true
exit 0
fi
sleep 1
done
log 'FAIL — vm ssh never succeeded post-restart'
# Dump diagnostics BEFORE teardown so the VM state is still inspectable.
diag
"$BANGER" vm delete "$VMNAME" >/dev/null 2>&1 || true
"$BANGER" daemon stop >/dev/null 2>&1 || true
exit 1

View file

@ -24,6 +24,23 @@ set -euo pipefail
# log emits a progress line on stderr with the [smoke] prefix.
log() {
printf '%s\n' "[smoke] $*" >&2
}
# die reports a fatal failure on stderr and stops the run.
die() {
printf '%s\n' "[smoke] FAIL: $*" >&2
exit 1
}
# wait_for_ssh retries `vm ssh <vm> -- true` once per second until it
# succeeds (returns 0) or a 60s budget elapses (returns 1). Needed
# because `vm ssh` — unlike `vm run` — does not itself block on guest
# sshd, so scenarios chaining `vm create` / `vm start` straight into
# `vm ssh` must poll. The 60s budget matches vmRunSSHTimeout.
wait_for_ssh() {
local vm="$1"
local stop_at
stop_at=$(( $(date +%s) + 60 ))
until "$BANGER" vm ssh "$vm" -- true >/dev/null 2>&1; do
(( $(date +%s) < stop_at )) || return 1
sleep 1
done
return 0
}
: "${BANGER_SMOKE_BIN_DIR:?must point at the instrumented binary dir, set by make smoke}"
: "${BANGER_SMOKE_COVER_DIR:?must point at the coverage dir, set by make smoke}"
: "${BANGER_SMOKE_XDG_DIR:?must point at the isolated XDG root, set by make smoke}"
@ -193,6 +210,85 @@ wait "$pidB" || die "concurrent VM B exited non-zero: $(cat "$tmpB")"
grep -q 'smoke-concurrent-a' "$tmpA" || die "concurrent VM A missing marker: $(cat "$tmpA")"
grep -q 'smoke-concurrent-b' "$tmpB" || die "concurrent VM B missing marker: $(cat "$tmpB")"
# --- vm lifecycle (create → stop → start → delete) --------------------
# Exercises lifecycle verbs directly instead of the --rm convenience
# path. The critical assertion is the second `vm ssh` AFTER stop/start:
# that path (a) rebuilds the handle cache via rediscoverHandles,
# (b) runs the e2fsck-snapshot sanitize step before patchRootOverlay
# on the dirty COW, and (c) shouldn't die from the SDK's
# ctx-SIGTERM-on-RPC-close goroutine. All three were bugs at one
# point; this scenario guards all three at once.
log 'vm lifecycle: explicit create / stop / start / ssh / delete'
lifecycle_name=smoke-lifecycle
# SC2064 is deliberate: expand $BANGER/$runtime_dir NOW so the trap
# survives any later variable reassignment.
# shellcheck disable=SC2064
trap "\"$BANGER\" vm delete $lifecycle_name >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT
"$BANGER" vm create --name "$lifecycle_name" >/dev/null || die "vm create $lifecycle_name failed"
show_out="$("$BANGER" vm show "$lifecycle_name")" || die "vm show after create failed"
grep -q '"state": "running"' <<<"$show_out" || die "post-create state not running: $show_out"
wait_for_ssh "$lifecycle_name" || die 'vm lifecycle: ssh did not come up after create'
ssh_out="$("$BANGER" vm ssh "$lifecycle_name" -- echo hello-1)" || die "vm ssh #1 failed"
grep -q 'hello-1' <<<"$ssh_out" || die "vm ssh #1 missing marker: $ssh_out"
"$BANGER" vm stop "$lifecycle_name" >/dev/null || die "vm stop failed"
show_out="$("$BANGER" vm show "$lifecycle_name")" || die "vm show after stop failed"
grep -q '"state": "stopped"' <<<"$show_out" || die "post-stop state not stopped: $show_out"
"$BANGER" vm start "$lifecycle_name" >/dev/null || die "vm start (from stopped) failed"
show_out="$("$BANGER" vm show "$lifecycle_name")" || die "vm show after start failed"
grep -q '"state": "running"' <<<"$show_out" || die "post-start state not running: $show_out"
wait_for_ssh "$lifecycle_name" || die 'vm lifecycle: ssh did not come up after restart'
ssh_out="$("$BANGER" vm ssh "$lifecycle_name" -- echo hello-2)" || die "vm ssh #2 (post-restart) failed"
grep -q 'hello-2' <<<"$ssh_out" || die "vm ssh #2 missing marker: $ssh_out"
"$BANGER" vm delete "$lifecycle_name" >/dev/null || die "vm delete failed"
# Expect vm show to FAIL after delete; capture its exit code without
# tripping set -e.
set +e
"$BANGER" vm show "$lifecycle_name" >/dev/null 2>&1
rc=$?
set -e
[[ "$rc" -ne 0 ]] || die "vm show still finds $lifecycle_name after delete"
# VM is gone; narrow the EXIT trap back to runtime-dir cleanup only.
# shellcheck disable=SC2064
trap "rm -rf '$runtime_dir'" EXIT
# --- vm set reconfiguration (vcpu change + restart) -------------------
# Exercises SetVM + configChangeCapability. Create with --vcpu 2,
# stop, `vm set --vcpu 4`, restart, confirm the guest sees the new
# count. Regression guard: a restart that reuses the pre-change spec
# would leave nproc at 2.
log 'vm set: create --vcpu 2 → stop → set --vcpu 4 → restart → nproc=4'
# SC2064 is deliberate: expand $BANGER/$runtime_dir NOW so the trap
# survives any later variable reassignment.
# shellcheck disable=SC2064
trap "\"$BANGER\" vm delete smoke-set >/dev/null 2>&1 || true; rm -rf '$runtime_dir'" EXIT
"$BANGER" vm create --name smoke-set --vcpu 2 >/dev/null || die 'vm set: create failed'
wait_for_ssh smoke-set || die 'vm set: initial ssh did not come up'
# Run ssh outside set -e so a non-zero exit is reported, not fatal here.
set +e
nproc_before="$("$BANGER" vm ssh smoke-set -- nproc 2>/dev/null)"
rc=$?
set -e
[[ "$rc" -eq 0 ]] || die "vm set: initial nproc ssh exit $rc"
# Strip whitespace before comparing — ssh output may carry a newline.
[[ "$(printf '%s' "$nproc_before" | tr -d '[:space:]')" == "2" ]] \
|| die "vm set: initial nproc got '$nproc_before', want 2"
"$BANGER" vm stop smoke-set >/dev/null || die 'vm set: stop failed'
"$BANGER" vm set smoke-set --vcpu 4 >/dev/null || die 'vm set: reconfigure failed'
"$BANGER" vm start smoke-set >/dev/null || die 'vm set: restart failed'
wait_for_ssh smoke-set || die 'vm set: post-reconfig ssh did not come up'
set +e
nproc_after="$("$BANGER" vm ssh smoke-set -- nproc 2>/dev/null)"
rc=$?
set -e
[[ "$rc" -eq 0 ]] || die "vm set: post-reconfig nproc ssh exit $rc"
[[ "$(printf '%s' "$nproc_after" | tr -d '[:space:]')" == "4" ]] \
|| die "vm set: post-reconfig nproc got '$nproc_after', want 4 (spec change didn't land)"
"$BANGER" vm delete smoke-set >/dev/null || die 'vm set: delete failed'
# VM is gone; narrow the EXIT trap back to runtime-dir cleanup only.
# shellcheck disable=SC2064
trap "rm -rf '$runtime_dir'" EXIT
# --- invalid spec rejection + no artifact leak ------------------------
# Tests the negative-path create flow: a blatantly invalid VM spec
# must fail before any VM row is persisted. The review cycle flagged