Fix VM lifecycle issues behind verify.sh

Make the Firecracker and bangerd processes outlive short-lived CLI request contexts so vm create no longer kills the VMM or daemon as soon as the RPC returns.

Fix fresh-VM SSH by flattening the seeded /root work disk when the copied home tree lands under a nested root/ directory, and write a guest sshd override to keep root pubkey auth explicit while debugging.

Harden teardown and smoke diagnostics: verify.sh now reports early Firecracker exit and delete failures directly, while dm snapshot cleanup tolerates already-gone handles and retries busy mapper removal long enough for Firecracker to release the device.

Validation: go test ./..., make build, bash -n verify.sh, direct SSH against a fresh VM, and a live ./verify.sh run that now completes with [verify] ok.
This commit is contained in:
Thales Maciel 2026-03-17 14:43:09 -03:00
parent 617f677c9b
commit 60294e8c90
No known key found for this signature in database
GPG key ID: 33112E6833C34679
7 changed files with 149 additions and 21 deletions

View file

@ -21,6 +21,22 @@ if [[ ! -f "$SSH_KEY" ]]; then
log "ssh key not found: $SSH_KEY"
exit 1
fi
DAEMON_LOG="${XDG_STATE_HOME:-$HOME/.local/state}/banger/bangerd.log"
firecracker_running() {
local pid="$1"
local api_sock="$2"
local cmdline=""
if [[ -z "$pid" || "$pid" -le 0 || -z "$api_sock" ]]; then
return 1
fi
if [[ ! -r "/proc/$pid/cmdline" ]]; then
return 1
fi
cmdline="$(cat "/proc/$pid/cmdline" 2>/dev/null | tr '\0' ' ' || true)"
[[ "$cmdline" == *firecracker* && "$cmdline" == *"$api_sock"* ]]
}
wait_for_ssh() {
local guest_ip="$1"
@ -62,6 +78,9 @@ wait_for_vm_ready() {
if [[ "$VM_STATE" == "error" || -n "$LAST_ERROR" ]]; then
return 2
fi
if [[ -n "$API_SOCK" && "${PID:-0}" -gt 0 ]] && ! firecracker_running "$PID" "$API_SOCK"; then
return 3
fi
if [[ "$VM_STATE" == "running" && -n "$GUEST_IP" && -n "$TAP" && -n "$VM_DIR" && -n "$API_SOCK" && "${PID:-0}" -gt 0 ]]; then
if [[ -S "$API_SOCK" ]] && ip link show "$TAP" >/dev/null 2>&1; then
return 0
@ -76,8 +95,16 @@ wait_for_vm_ready() {
dump_diagnostics() {
log "diagnostics for $VM_NAME"
./banger vm show "$VM_NAME" || true
if [[ "${PID:-0}" -gt 0 ]]; then
log "process state for pid $PID"
ps -fp "$PID" || true
fi
log "recent firecracker log"
./banger vm logs "$VM_NAME" 2>/dev/null | tail -n 200 || true
if [[ -f "$DAEMON_LOG" ]]; then
log "recent daemon log"
tail -n 200 "$DAEMON_LOG" || true
fi
if [[ -n "${TAP:-}" ]]; then
log "tap state for $TAP"
ip link show "$TAP" || true
@ -124,6 +151,12 @@ PID="0"
VM_STATE=""
LAST_ERROR=""
delete_vm() {
if [[ -n "${VM_NAME:-}" ]]; then
./banger vm delete "$VM_NAME"
fi
}
cleanup() {
if [[ -n "${VM_NAME:-}" ]]; then
./banger vm delete "$VM_NAME" >/dev/null 2>&1 || true
@ -142,8 +175,15 @@ fi
BOOT_DEADLINE=$((SECONDS + BOOT_TIMEOUT_SECS))
log "waiting for VM runtime readiness"
if ! wait_for_vm_ready "$BOOT_DEADLINE"; then
log "vm did not become ready before timeout"
if wait_for_vm_ready "$BOOT_DEADLINE"; then
:
else
status=$?
case "$status" in
2) log "vm entered an error state before becoming ready" ;;
3) log "firecracker exited before the guest became ready" ;;
*) log "vm did not become ready before timeout" ;;
esac
dump_diagnostics
exit 1
fi
@ -176,7 +216,11 @@ if (( NAT_ENABLED )); then
fi
log "cleaning up VM"
cleanup
if ! delete_vm; then
log "vm delete failed for $VM_NAME"
dump_diagnostics
exit 1
fi
log "asserting cleanup success"
if ./banger vm show "$VM_NAME" >/dev/null 2>&1; then