#!/usr/bin/env bash # # scripts/repro-restart-bug.sh — minimal reproducer for the # stop-then-start bug. # # Symptom: after `vm create X` → `vm stop X` → `vm start X`, the store # reports `state=running` but `vm ssh X` returns `not_running` # because the daemon's `vmAlive(vm)` check returns false. Seen # reliably on Debian-bookworm default image. # # This script: # 1. Builds instrumented binaries (reuses $(SMOKE_BIN_DIR)) # 2. Points banger at an isolated XDG so it doesn't touch the # invoking user's real install # 3. Runs the create→stop→start sequence # 4. Asserts `vm ssh -- true` works post-restart # 5. On failure, dumps the daemon log (vm.start trace), firecracker # log (guest kernel output), pgrep state (is firecracker # actually running?), api-sock presence, and handles.json # # Exit 0 = bug is fixed. Exit 1 = bug still reproduces. # # Run directly (builds binaries on demand): # ./scripts/repro-restart-bug.sh set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" cd "$REPO_ROOT" log() { printf '[repro] %s\n' "$*" >&2; } die() { printf '[repro] FAIL: %s\n' "$*" >&2; exit 1; } # Reuse smoke binaries if present; otherwise build them. They're # instrumented with -cover, but that's harmless for this test. make smoke-build >/dev/null BIN_DIR="$REPO_ROOT/build/smoke/bin" for bin in banger bangerd banger-vsock-agent; do [[ -x "$BIN_DIR/$bin" ]] || die "missing $BIN_DIR/$bin; run make smoke-build" done BANGER="$BIN_DIR/banger" VMNAME=repro-restart # Isolated XDG root, torn down at exit. Unlike smoke.sh we do NOT # persist across runs — we want a clean slate so the very first # image pull also exercises the second-start path. WORKDIR="$(mktemp -d -t banger-repro-XXXXXX)" trap 'rm -rf "$WORKDIR"' EXIT export XDG_CONFIG_HOME="$WORKDIR/config" export XDG_STATE_HOME="$WORKDIR/state" export XDG_CACHE_HOME="$WORKDIR/cache" export XDG_RUNTIME_DIR="$WORKDIR/runtime" mkdir -p "$XDG_CONFIG_HOME" "$XDG_STATE_HOME" "$XDG_CACHE_HOME" "$XDG_RUNTIME_DIR" chmod 0700 "$XDG_RUNTIME_DIR" export BANGER_DAEMON_BIN="$BIN_DIR/bangerd" export BANGER_VSOCK_AGENT_BIN="$BIN_DIR/banger-vsock-agent" export GOCOVERDIR="$WORKDIR/covdata" mkdir -p "$GOCOVERDIR" # Refuse to run if the user's real daemon has :42069 bound — we'd # fail for the wrong reason. if command -v ss >/dev/null 2>&1 && ss -Huln 2>/dev/null | awk '{print $4}' | grep -q '[:.]42069$'; then die 'port 127.0.0.1:42069 already bound; stop your real banger daemon first' fi "$BANGER" daemon stop >/dev/null 2>&1 || true LOG_PATH="$XDG_STATE_HOME/banger/bangerd.log" diag() { printf '\n[repro] === DIAGNOSTICS ===\n' >&2 printf '[repro] vm show:\n' >&2 "$BANGER" vm show "$VMNAME" >&2 2>/dev/null || true local vmdir vmdir="$("$BANGER" vm show "$VMNAME" 2>/dev/null | awk -F'"' '/"vm_dir"/ {print $4}')" local apisock apisock="$("$BANGER" vm show "$VMNAME" 2>/dev/null | awk -F'"' '/"api_sock_path"/ {print $4}')" printf '\n[repro] handles.json:\n' >&2 [[ -n "$vmdir" && -f "$vmdir/handles.json" ]] && cat "$vmdir/handles.json" >&2 || echo ' (missing)' >&2 printf '\n[repro] pgrep by apiSock (%s):\n' "$apisock" >&2 [[ -n "$apisock" ]] && (pgrep -af "$apisock" >&2 || echo ' (none)' >&2) || echo ' (no apisock)' >&2 printf '\n[repro] apiSock present:\n' >&2 if [[ -n "$apisock" ]]; then if [[ -S "$apisock" ]]; then sudo -n ls -la "$apisock" >&2 2>/dev/null || ls -la "$apisock" >&2 2>/dev/null || echo ' (cannot stat)' >&2 else echo ' NOT PRESENT' >&2 fi fi printf '\n[repro] daemon log — last vm.start trace:\n' >&2 if [[ -f "$LOG_PATH" ]]; then # Dump everything from the most recent "operation started" for vm.start. awk ' /"operation":"vm\.start"/ && /"msg":"operation started"/ { lastStart=NR } { lines[NR]=$0 } END { if (lastStart) for (i=lastStart; i<=NR; i++) print lines[i] } ' "$LOG_PATH" | tail -40 >&2 else echo ' (daemon log missing)' >&2 fi printf '\n[repro] firecracker.log tail (guest kernel output):\n' >&2 [[ -n "$vmdir" && -f "$vmdir/firecracker.log" ]] && tail -30 "$vmdir/firecracker.log" >&2 || echo ' (missing)' >&2 printf '\n' >&2 } log "create $VMNAME" "$BANGER" vm create --name "$VMNAME" >/dev/null || die "create failed" log 'wait for initial ssh' deadline=$(( $(date +%s) + 90 )) while (( $(date +%s) < deadline )); do "$BANGER" vm ssh "$VMNAME" -- true >/dev/null 2>&1 && break sleep 1 done "$BANGER" vm ssh "$VMNAME" -- true >/dev/null 2>&1 || { diag; die 'initial ssh never came up'; } log 'stop' "$BANGER" vm stop "$VMNAME" >/dev/null || { diag; die 'stop failed'; } log 'start (this is where the bug manifests)' "$BANGER" vm start "$VMNAME" >/dev/null || { diag; die 'start failed'; } log 'assert vm ssh succeeds post-restart (60s budget)' deadline=$(( $(date +%s) + 60 )) while (( $(date +%s) < deadline )); do if "$BANGER" vm ssh "$VMNAME" -- true >/dev/null 2>&1; then log 'PASS — bug appears fixed' "$BANGER" vm delete "$VMNAME" >/dev/null 2>&1 || true "$BANGER" daemon stop >/dev/null 2>&1 || true exit 0 fi sleep 1 done log 'FAIL — vm ssh never succeeded post-restart' diag "$BANGER" vm delete "$VMNAME" >/dev/null 2>&1 || true "$BANGER" daemon stop >/dev/null 2>&1 || true exit 1