Add regression coverage for VM failure paths

Dangerous lifecycle, store, system, and RPC paths still had little or no automated test coverage, and the live smoke harness failed opaquely when guest boot timing drifted. This adds targeted unit coverage for store allocation and decode failures, system helper failure ordering and cleanup, RPC error handling, and daemon lookup/reconcile/editing/stats/preflight edge cases.

It also makes verify.sh wait for daemon-observable VM readiness before SSH, reuse a bounded boot deadline for the SSH phase, and dump VM metadata, logs, tap state, socket state, and NAT rules on timeout so host-level failures are diagnosable instead of surfacing only connection refused.

Validation: go test ./..., go test ./... -cover, bash -n verify.sh. No live ./verify.sh boot was run in this environment.
This commit is contained in:
Thales Maciel 2026-03-16 15:46:54 -03:00
parent fcedacba5c
commit 5018bc6170
No known key found for this signature in database
GPG key ID: 33112E6833C34679
5 changed files with 1229 additions and 16 deletions

View file

@ -24,7 +24,7 @@ fi
wait_for_ssh() {
local guest_ip="$1"
local deadline=$((SECONDS + 60))
local deadline="$2"
while ((SECONDS < deadline)); do
if ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
@ -37,6 +37,62 @@ wait_for_ssh() {
return 1
}
# Re-query the VM and refresh the cached runtime globals.
#
# Runs `./banger vm show "$VM_NAME"` with stderr discarded and stores its
# stdout in VM_JSON. Each field is then pulled out of that JSON with jq; the
# `// empty` / `// 0` alternatives mean a null (or absent) field yields an
# empty string, except PID, which falls back to 0.
#
# Globals:
#   VM_NAME (read)
#   VM_JSON, TAP, VM_DIR, GUEST_IP, API_SOCK, PID, VM_STATE, LAST_ERROR (written)
# Returns:
#   0 once the fields have been refreshed, 1 when `banger vm show` fails.
refresh_vm_metadata() {
  # Bail out before touching the derived fields when the VM cannot be shown.
  VM_JSON="$(./banger vm show "$VM_NAME" 2>/dev/null)" || return 1

  TAP="$(jq -r '.runtime.tap_device // empty' <<<"$VM_JSON")"
  VM_DIR="$(jq -r '.runtime.vm_dir // empty' <<<"$VM_JSON")"
  GUEST_IP="$(jq -r '.runtime.guest_ip // empty' <<<"$VM_JSON")"
  API_SOCK="$(jq -r '.runtime.api_sock_path // empty' <<<"$VM_JSON")"
  PID="$(jq -r '.runtime.pid // 0' <<<"$VM_JSON")"
  VM_STATE="$(jq -r '.state // empty' <<<"$VM_JSON")"
  LAST_ERROR="$(jq -r '.runtime.last_error // empty' <<<"$VM_JSON")"
  return 0
}
# Poll the VM until its runtime looks ready, fails, or the deadline passes.
#
# Arguments:
#   $1 - deadline expressed as a value of the shell's SECONDS counter
# Globals (read, populated by refresh_vm_metadata):
#   VM_STATE, LAST_ERROR, GUEST_IP, TAP, VM_DIR, API_SOCK, PID
# Returns:
#   0 - state is "running", all runtime fields are set, PID > 0, the API
#       socket exists and `ip link show` finds the tap device
#   2 - the VM reports state "error" or a non-empty last error
#   1 - the deadline elapsed first
wait_for_vm_ready() {
  local ready_deadline="$1"

  while ((SECONDS < ready_deadline)); do
    # A failed refresh is not fatal: skip the checks and poll again.
    if refresh_vm_metadata; then
      if [[ "$VM_STATE" == "error" ]] || [[ -n "$LAST_ERROR" ]]; then
        return 2
      fi

      # Metadata must be complete before the host-side socket/tap probes run.
      if [[ "$VM_STATE" == "running" && -n "$GUEST_IP" && -n "$TAP" && -n "$VM_DIR" && -n "$API_SOCK" && "${PID:-0}" -gt 0 ]] \
        && [[ -S "$API_SOCK" ]] \
        && ip link show "$TAP" >/dev/null 2>&1; then
        return 0
      fi
    fi

    sleep 1
  done

  return 1
}
# Print best-effort host-side diagnostics for the VM under test.
#
# Emits, in order: `banger vm show`, the last 200 lines of `banger vm logs`,
# the tap link state, a listing of the API socket, and (only when NAT is
# enabled and the uplink, guest IP and tap are all known) the iptables rules
# that mention this guest. Every probe is followed by `|| true` so a failing
# probe never aborts the remaining diagnostics.
#
# Globals (read):
#   VM_NAME, TAP, API_SOCK, NAT_ENABLED, UPLINK, GUEST_IP
dump_diagnostics() {
  log "diagnostics for $VM_NAME"
  ./banger vm show "$VM_NAME" || true

  log "recent firecracker log"
  ./banger vm logs "$VM_NAME" 2>/dev/null | tail -n 200 || true

  if [[ -n "${TAP:-}" ]]; then
    log "tap state for $TAP"
    ip link show "$TAP" || true
  fi

  if [[ -n "${API_SOCK:-}" ]]; then
    log "api socket $API_SOCK"
    ls -l "$API_SOCK" 2>/dev/null || true
  fi

  if (( NAT_ENABLED )) && [[ -n "${UPLINK:-}" && -n "${GUEST_IP:-}" && -n "${TAP:-}" ]]; then
    log "nat rules for ${GUEST_IP} via ${UPLINK}"
    # -F: match the address/device literally (dots are not regex wildcards).
    # -w: whole-word match so tap1 does not also list tap10's rules and
    #     10.0.0.2/32 does not also list 110.0.0.2/32.
    sudo iptables -t nat -S POSTROUTING | grep -Fw -- "${GUEST_IP}/32" || true
    sudo iptables -S FORWARD | grep -Fw -- "$TAP" || true
  fi
}
usage() {
cat <<'EOF'
Usage: ./verify.sh [--nat]
@ -47,6 +103,7 @@ EOF
}
NAT_ENABLED=0
BOOT_TIMEOUT_SECS="${VERIFY_BOOT_TIMEOUT_SECS:-90}"
if [[ "${1:-}" == "--nat" ]]; then
NAT_ENABLED=1
shift
@ -62,6 +119,10 @@ TAP=""
VM_DIR=""
GUEST_IP=""
UPLINK=""
API_SOCK=""
PID="0"
VM_STATE=""
LAST_ERROR=""
cleanup() {
if [[ -n "${VM_NAME:-}" ]]; then
@ -78,21 +139,15 @@ if (( NAT_ENABLED )); then
fi
"${CREATE_ARGS[@]}" >/dev/null
VM_JSON="$(./banger vm show "$VM_NAME")"
name="$(printf '%s\n' "$VM_JSON" | jq -r '.name // empty')"
guest_ip="$(printf '%s\n' "$VM_JSON" | jq -r '.runtime.guest_ip // empty')"
tap="$(printf '%s\n' "$VM_JSON" | jq -r '.runtime.tap_device // empty')"
vm_dir="$(printf '%s\n' "$VM_JSON" | jq -r '.runtime.vm_dir // empty')"
BOOT_DEADLINE=$((SECONDS + BOOT_TIMEOUT_SECS))
if [[ -z "$name" || -z "$guest_ip" || -z "$tap" || -z "$vm_dir" ]]; then
log "missing VM metadata from banger vm show"
log "waiting for VM runtime readiness"
if ! wait_for_vm_ready "$BOOT_DEADLINE"; then
log "vm did not become ready before timeout"
dump_diagnostics
exit 1
fi
TAP="$tap"
VM_DIR="$vm_dir"
GUEST_IP="$guest_ip"
if (( NAT_ENABLED )); then
UPLINK="$(ip route show default 2>/dev/null | awk '/default/ {print $5; exit}')"
if [[ -z "$UPLINK" ]]; then
@ -106,17 +161,18 @@ if (( NAT_ENABLED )); then
fi
log "asserting VM is reachable via SSH"
if ! wait_for_ssh "$guest_ip"; then
log "ssh did not become ready for ${guest_ip}"
if ! wait_for_ssh "$GUEST_IP" "$BOOT_DEADLINE"; then
log "ssh did not become ready for ${GUEST_IP}"
dump_diagnostics
exit 1
fi
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
"root@${guest_ip}" "uname -a" >/dev/null
"root@${GUEST_IP}" "uname -a" >/dev/null
if (( NAT_ENABLED )); then
log "asserting VM has outbound network access"
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
"root@${guest_ip}" "curl -fsS https://example.com >/dev/null" >/dev/null
"root@${GUEST_IP}" "curl -fsS https://example.com >/dev/null" >/dev/null
fi
log "cleaning up VM"