banger/verify.sh
Thales Maciel 5018bc6170
Add regression coverage for VM failure paths
Dangerous lifecycle, store, system, and RPC paths still had little or no automated confidence, and the live smoke harness failed opaquely when guest boot timing drifted. This adds targeted unit coverage for store allocation and decode failures, system helper failure ordering and cleanup, RPC error handling, and daemon lookup/reconcile/editing/stats/preflight edge cases.

It also makes verify.sh wait for daemon-observable VM readiness before SSH, reuse a bounded boot deadline for the SSH phase, and dump VM metadata, logs, tap state, socket state, and NAT rules on timeout so host-level failures are diagnosable instead of surfacing only connection refused.

Validation: go test ./..., go test ./... -cover, bash -n verify.sh. No live ./verify.sh boot was run in this environment.
2026-03-16 15:46:54 -03:00

209 lines
5.4 KiB
Bash
Executable file

#!/usr/bin/env bash
# Smoke test for the banger VM workflow: creates a throwaway VM, waits for it
# to become ready, checks SSH reachability (and, with --nat, the host NAT rules
# and outbound access), then deletes the VM and asserts host-side cleanup.
#
# Usage: ./verify.sh [--nat]
#
# Environment:
#   BANGER_RUNTIME_DIR        overrides the runtime bundle directory
#   VERIFY_BOOT_TIMEOUT_SECS  boot deadline in seconds (default: 90)
#
# Strict mode: abort on command failure (-e), on use of unset variables (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Print one line to stdout tagged with "[verify]".
# Arguments: all arguments are joined (via "$*") into a single message.
log() {
  local message="$*"
  printf '[verify] %s\n' "$message"
}
# Locate the runtime bundle and the SSH key used to reach the guest.
# DIR is the directory containing this script. The bundle defaults to
# "$DIR/runtime" when that directory exists and to "$DIR" otherwise;
# BANGER_RUNTIME_DIR, when set and non-empty, overrides either default.
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [[ -d "$DIR/runtime" ]]; then
  DEFAULT_RUNTIME_DIR="$DIR/runtime"
else
  DEFAULT_RUNTIME_DIR="$DIR"
fi
RUNTIME_DIR="${BANGER_RUNTIME_DIR:-$DEFAULT_RUNTIME_DIR}"
SSH_KEY="$RUNTIME_DIR/id_ed25519"

# Fail fast, before any VM is created, when the bundle or its key is missing.
[[ -d "$RUNTIME_DIR" ]] || {
  log "runtime bundle not found: $RUNTIME_DIR"
  log "run 'make runtime-bundle' or set BANGER_RUNTIME_DIR"
  exit 1
}
[[ -f "$SSH_KEY" ]] || {
  log "ssh key not found: $SSH_KEY"
  exit 1
}
# Poll the guest over SSH until a trivial command succeeds or the deadline
# passes.
# Globals:   SSH_KEY (read) - private key used to log in as root
# Arguments: $1 - guest IP address
#            $2 - deadline, expressed as a value of bash's SECONDS counter
# Returns:   0 once `ssh ... true` succeeds, 1 if the deadline is reached first
wait_for_ssh() {
  local guest_ip="$1"
  local deadline="$2"
  # BatchMode=yes makes ssh fail instead of prompting for a password or
  # passphrase; an interactive prompt would block this loop past the deadline.
  # ConnectTimeout bounds each individual connection attempt.
  local -a ssh_opts=(
    -i "$SSH_KEY"
    -o BatchMode=yes
    -o StrictHostKeyChecking=no
    -o UserKnownHostsFile=/dev/null
    -o ConnectTimeout=2
  )
  while ((SECONDS < deadline)); do
    if ssh "${ssh_opts[@]}" "root@${guest_ip}" "true" >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done
  return 1
}
# Re-query `./banger vm show` and refresh the cached runtime fields from its
# JSON output.
# Globals:   VM_NAME (read); VM_JSON, TAP, VM_DIR, GUEST_IP, API_SOCK, PID,
#            VM_STATE, LAST_ERROR (written)
# Returns:   1 when `./banger vm show` fails, 0 otherwise
refresh_vm_metadata() {
  VM_JSON="$(./banger vm show "$VM_NAME" 2>/dev/null)" || return 1

  # `// empty` turns a missing/null field into an empty string; pid falls back
  # to 0 instead.
  VM_STATE="$(jq -r '.state // empty' <<<"$VM_JSON")"
  TAP="$(jq -r '.runtime.tap_device // empty' <<<"$VM_JSON")"
  VM_DIR="$(jq -r '.runtime.vm_dir // empty' <<<"$VM_JSON")"
  GUEST_IP="$(jq -r '.runtime.guest_ip // empty' <<<"$VM_JSON")"
  API_SOCK="$(jq -r '.runtime.api_sock_path // empty' <<<"$VM_JSON")"
  PID="$(jq -r '.runtime.pid // 0' <<<"$VM_JSON")"
  LAST_ERROR="$(jq -r '.runtime.last_error // empty' <<<"$VM_JSON")"
  return 0
}
# Poll the VM metadata once per second until the VM looks ready, reports an
# error, or the deadline passes.
# Globals:   VM_STATE, LAST_ERROR, GUEST_IP, TAP, VM_DIR, API_SOCK, PID (read;
#            refreshed on every iteration by refresh_vm_metadata)
# Arguments: $1 - deadline, expressed as a value of bash's SECONDS counter
# Returns:   0 when the VM is running with all runtime fields populated, the
#              API socket exists and the tap device is visible to `ip link`
#            1 when the deadline is reached first
#            2 when the state is "error" or a last_error message is present
wait_for_vm_ready() {
  local deadline="$1"
  until ((SECONDS >= deadline)); do
    # A failed metadata refresh is treated as "not ready yet" and retried.
    if refresh_vm_metadata; then
      if [[ -n "$LAST_ERROR" || "$VM_STATE" == "error" ]]; then
        return 2
      fi
      if [[ "$VM_STATE" == "running" ]] &&
        [[ -n "$GUEST_IP" && -n "$TAP" && -n "$VM_DIR" && -n "$API_SOCK" ]] &&
        [[ "${PID:-0}" -gt 0 ]] &&
        [[ -S "$API_SOCK" ]] &&
        ip link show "$TAP" >/dev/null 2>&1; then
        return 0
      fi
    fi
    sleep 1
  done
  return 1
}
# Print best-effort host-side diagnostics for the VM under test. Every probe is
# allowed to fail (`|| true`) so that one missing piece does not hide the rest.
# Globals:   VM_NAME, TAP, API_SOCK, NAT_ENABLED, UPLINK, GUEST_IP (read)
# Outputs:   `banger vm show`, the last 200 lines of `banger vm logs`, the tap
#            link state, the API socket listing and, when NAT is enabled and
#            the uplink/guest IP/tap are known, the matching iptables rules
dump_diagnostics() {
  local tap="${TAP:-}"
  local sock="${API_SOCK:-}"

  log "diagnostics for $VM_NAME"
  ./banger vm show "$VM_NAME" || true

  log "recent firecracker log"
  ./banger vm logs "$VM_NAME" 2>/dev/null | tail -n 200 || true

  if [[ -n "$tap" ]]; then
    log "tap state for $tap"
    ip link show "$tap" || true
  fi

  if [[ -n "$sock" ]]; then
    log "api socket $sock"
    ls -l "$sock" 2>/dev/null || true
  fi

  # NAT rules are only worth dumping when NAT was requested and every value
  # needed to filter the rule listing is known.
  if ! (( NAT_ENABLED )); then
    return 0
  fi
  [[ -n "${UPLINK:-}" && -n "${GUEST_IP:-}" && -n "$tap" ]] || return 0

  log "nat rules for ${GUEST_IP} via ${UPLINK}"
  sudo iptables -t nat -S POSTROUTING | grep "${GUEST_IP}/32" || true
  sudo iptables -S FORWARD | grep "$tap" || true
}
# Print the command-line help text to stdout.
usage() {
  printf '%s\n' \
    'Usage: ./verify.sh [--nat]' \
    'Run a basic smoke test for the Go VM workflow.' \
    'Use --nat to additionally verify outbound NAT and host rule cleanup.'
}
# Defaults. The boot timeout can be overridden via VERIFY_BOOT_TIMEOUT_SECS.
NAT_ENABLED=0
BOOT_TIMEOUT_SECS="${VERIFY_BOOT_TIMEOUT_SECS:-90}"

# --nat is the only accepted argument; anything left over is a usage error.
case "${1:-}" in
  --nat)
    NAT_ENABLED=1
    shift
    ;;
esac
if [[ $# -ne 0 ]]; then
  usage
  exit 1
fi

# VM name derived from the current epoch second.
VM_NAME="verify-$(date +%s)"

# Runtime state, initialised empty here so later reads are safe under `set -u`.
VM_JSON=""
VM_STATE=""
LAST_ERROR=""
TAP=""
VM_DIR=""
GUEST_IP=""
API_SOCK=""
UPLINK=""
PID="0"
# Best-effort removal of the test VM: output is discarded and failures are
# ignored, so calling it more than once (explicitly and via the EXIT trap) is
# harmless.
# Globals:   VM_NAME (read)
cleanup() {
  if [[ -z "${VM_NAME:-}" ]]; then
    return 0
  fi
  ./banger vm delete "$VM_NAME" >/dev/null 2>&1 || true
}
# Run cleanup on every exit path of the script.
trap cleanup EXIT
# Create the VM (adding --nat when requested) and wait for the daemon to report
# it as ready. The same BOOT_DEADLINE is reused by the later SSH wait, so the
# whole boot is bounded by BOOT_TIMEOUT_SECS.
log "starting VM"
CREATE_ARGS=(./banger vm create --name "$VM_NAME")
if (( NAT_ENABLED )); then
  CREATE_ARGS+=(--nat)
fi
"${CREATE_ARGS[@]}" >/dev/null
BOOT_DEADLINE=$((SECONDS + BOOT_TIMEOUT_SECS))
log "waiting for VM runtime readiness"
# Capture the exit status explicitly: `if ! cmd` would discard it, and
# wait_for_vm_ready distinguishes a timeout (1) from a VM-reported error (2).
READY_RC=0
wait_for_vm_ready "$BOOT_DEADLINE" || READY_RC=$?
if (( READY_RC != 0 )); then
  if (( READY_RC == 2 )); then
    log "vm reported an error before becoming ready: ${LAST_ERROR:-state=${VM_STATE}}"
  else
    log "vm did not become ready before timeout"
  fi
  dump_diagnostics
  exit 1
fi
# With --nat, check that the host carries a MASQUERADE rule for the guest and
# FORWARD rules in both directions between the tap device and the uplink.
if (( NAT_ENABLED )); then
  # The fifth field of the first default route line is used as the uplink
  # interface name.
  UPLINK="$(ip route show default 2>/dev/null | awk '/default/ {print $5; exit}')"
  [[ -n "$UPLINK" ]] || {
    log "failed to detect uplink interface"
    exit 1
  }

  log "asserting NAT rules are installed"
  # Each check runs unguarded, so a non-zero exit from iptables aborts the
  # script under `set -e`.
  sudo iptables -t nat -C POSTROUTING -s "${GUEST_IP}/32" -o "$UPLINK" -j MASQUERADE
  sudo iptables -C FORWARD -i "$TAP" -o "$UPLINK" -j ACCEPT
  sudo iptables -C FORWARD -i "$UPLINK" -o "$TAP" -m state --state RELATED,ESTABLISHED -j ACCEPT
fi
# Wait for SSH within what remains of the boot deadline, then run a real
# command in the guest and (with --nat) an outbound HTTPS fetch from it.
log "asserting VM is reachable via SSH"
if ! wait_for_ssh "$GUEST_IP" "$BOOT_DEADLINE"; then
  log "ssh did not become ready for ${GUEST_IP}"
  dump_diagnostics
  exit 1
fi

# Shared ssh invocation for the guest commands below; host key checking is
# disabled and no known_hosts entry is recorded.
GUEST_SSH=(
  ssh -i "$SSH_KEY"
  -o StrictHostKeyChecking=no
  -o UserKnownHostsFile=/dev/null
  "root@${GUEST_IP}"
)
"${GUEST_SSH[@]}" "uname -a" >/dev/null

if (( NAT_ENABLED )); then
  log "asserting VM has outbound network access"
  "${GUEST_SSH[@]}" "curl -fsS https://example.com >/dev/null" >/dev/null
fi
# Delete the VM, then verify that nothing it owned is left on the host: the VM
# record, the tap device, the VM directory and (with --nat) the iptables rules.
log "cleaning up VM"
cleanup
log "asserting cleanup success"

# Log the leftover that was found and fail the run.
fail_cleanup() {
  log "$1"
  exit 1
}

if ./banger vm show "$VM_NAME" >/dev/null 2>&1; then
  fail_cleanup "vm still exists after delete: $VM_NAME"
fi
if ip link show "$TAP" >/dev/null 2>&1; then
  fail_cleanup "tap still exists: $TAP"
fi
if [[ -d "$VM_DIR" ]]; then
  fail_cleanup "vm dir still exists: $VM_DIR"
fi

if (( NAT_ENABLED )); then
  # A successful `iptables -C` here means the rule is still installed.
  if sudo iptables -t nat -C POSTROUTING -s "${GUEST_IP}/32" -o "$UPLINK" -j MASQUERADE 2>/dev/null; then
    fail_cleanup "nat rule still exists for ${GUEST_IP}"
  fi
  if sudo iptables -C FORWARD -i "$TAP" -o "$UPLINK" -j ACCEPT 2>/dev/null; then
    fail_cleanup "forward-out rule still exists for ${TAP}"
  fi
  if sudo iptables -C FORWARD -i "$UPLINK" -o "$TAP" -m state --state RELATED,ESTABLISHED -j ACCEPT 2>/dev/null; then
    fail_cleanup "forward-in rule still exists for ${TAP}"
  fi
fi

log "ok"