Add regression coverage for VM failure paths
Dangerous lifecycle, store, system, and RPC paths still had little or no automated test coverage, and the live smoke harness failed opaquely when guest boot timing drifted. This commit adds targeted unit coverage for store allocation and decode failures, system helper failure ordering and cleanup, RPC error handling, and daemon lookup/reconcile/editing/stats/preflight edge cases. It also makes verify.sh wait for daemon-observable VM readiness before attempting SSH, reuse a single bounded boot deadline for the SSH phase, and dump VM metadata, logs, tap state, socket state, and NAT rules on timeout, so that host-level failures are diagnosable instead of surfacing only as "connection refused". Validation: "go test ./...", "go test ./... -cover", and "bash -n verify.sh". No live ./verify.sh boot was run in this environment.
This commit is contained in:
parent
fcedacba5c
commit
5018bc6170
5 changed files with 1229 additions and 16 deletions
88
verify.sh
88
verify.sh
|
|
@ -24,7 +24,7 @@ fi
|
|||
|
||||
wait_for_ssh() {
|
||||
local guest_ip="$1"
|
||||
local deadline=$((SECONDS + 60))
|
||||
local deadline="$2"
|
||||
|
||||
while ((SECONDS < deadline)); do
|
||||
if ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
|
||||
|
|
@ -37,6 +37,62 @@ wait_for_ssh() {
|
|||
return 1
|
||||
}
|
||||
|
||||
# Refresh the cached VM metadata globals from `./banger vm show "$VM_NAME"`.
#
# Globals read:    VM_NAME
# Globals written: VM_JSON, TAP, VM_DIR, GUEST_IP, API_SOCK, PID, VM_STATE,
#                  LAST_ERROR
# Returns: 0 when the lookup succeeded and every field was extracted;
#          1 when `banger vm show` or any `jq` extraction failed.
#
# Everything is parsed into locals first and the globals are only assigned
# once all extractions have succeeded, so a failed or malformed lookup never
# leaves the globals half-updated or blanked out.
refresh_vm_metadata() {
	local json tap vm_dir guest_ip api_sock pid state last_error

	if ! json="$(./banger vm show "$VM_NAME" 2>/dev/null)"; then
		return 1
	fi

	# Missing string fields become empty; a missing pid becomes 0.
	tap="$(jq -r '.runtime.tap_device // empty' <<<"$json")" || return 1
	vm_dir="$(jq -r '.runtime.vm_dir // empty' <<<"$json")" || return 1
	guest_ip="$(jq -r '.runtime.guest_ip // empty' <<<"$json")" || return 1
	api_sock="$(jq -r '.runtime.api_sock_path // empty' <<<"$json")" || return 1
	pid="$(jq -r '.runtime.pid // 0' <<<"$json")" || return 1
	state="$(jq -r '.state // empty' <<<"$json")" || return 1
	last_error="$(jq -r '.runtime.last_error // empty' <<<"$json")" || return 1

	VM_JSON="$json"
	TAP="$tap"
	VM_DIR="$vm_dir"
	GUEST_IP="$guest_ip"
	API_SOCK="$api_sock"
	PID="$pid"
	VM_STATE="$state"
	LAST_ERROR="$last_error"
	return 0
}
|
||||
|
||||
# Poll the VM metadata once per second until the VM looks ready or the
# deadline passes.
#
# Arguments: $1 - deadline, compared against bash's SECONDS counter
# Globals:   VM_STATE, LAST_ERROR, GUEST_IP, TAP, VM_DIR, API_SOCK, PID
#            (as populated by refresh_vm_metadata)
# Returns:   0 - state is "running", all runtime fields are populated, the
#                API socket exists and the tap device is visible to `ip link`
#            2 - the VM reports state "error" or a non-empty last error
#            1 - the deadline passed without either of the above
wait_for_vm_ready() {
	local deadline="$1"

	while ((SECONDS < deadline)); do
		# A failed metadata lookup is not fatal; just retry after the sleep.
		if refresh_vm_metadata; then
			if [[ -n "$LAST_ERROR" || "$VM_STATE" == "error" ]]; then
				return 2
			fi

			if [[ "$VM_STATE" == "running" ]] \
				&& [[ -n "$GUEST_IP" && -n "$TAP" && -n "$VM_DIR" && -n "$API_SOCK" ]] \
				&& [[ "${PID:-0}" -gt 0 ]] \
				&& [[ -S "$API_SOCK" ]] \
				&& ip link show "$TAP" >/dev/null 2>&1; then
				return 0
			fi
		fi

		sleep 1
	done

	return 1
}
|
||||
|
||||
# Print best-effort diagnostics for $VM_NAME: VM metadata, the last 200 log
# lines, tap device state, the API socket, and (when NAT is enabled) the
# matching iptables rules.
#
# Globals read: VM_NAME, TAP, API_SOCK, NAT_ENABLED, UPLINK, GUEST_IP
# Every probe is followed by `|| true` so that a failing probe never aborts
# the diagnostics dump itself.
dump_diagnostics() {
	log "diagnostics for $VM_NAME"
	./banger vm show "$VM_NAME" || true

	log "recent firecracker log"
	./banger vm logs "$VM_NAME" 2>/dev/null | tail -n 200 || true

	if [[ -n "${TAP:-}" ]]; then
		log "tap state for $TAP"
		ip link show "$TAP" || true
	fi

	if [[ -n "${API_SOCK:-}" ]]; then
		log "api socket $API_SOCK"
		ls -l "$API_SOCK" 2>/dev/null || true
	fi

	if (( NAT_ENABLED )) && [[ -n "${UPLINK:-}" && -n "${GUEST_IP:-}" && -n "${TAP:-}" ]]; then
		log "nat rules for ${GUEST_IP} via ${UPLINK}"
		# -F: match the address and tap name literally; as a regex the dots
		# in the IP would match any character and pull in unrelated rules.
		sudo iptables -t nat -S POSTROUTING | grep -F -- "${GUEST_IP}/32" || true
		sudo iptables -S FORWARD | grep -F -- "$TAP" || true
	fi
}
|
||||
|
||||
usage() {
|
||||
cat <<'EOF'
|
||||
Usage: ./verify.sh [--nat]
|
||||
|
|
@ -47,6 +103,7 @@ EOF
|
|||
}
|
||||
|
||||
NAT_ENABLED=0
|
||||
BOOT_TIMEOUT_SECS="${VERIFY_BOOT_TIMEOUT_SECS:-90}"
|
||||
if [[ "${1:-}" == "--nat" ]]; then
|
||||
NAT_ENABLED=1
|
||||
shift
|
||||
|
|
@ -62,6 +119,10 @@ TAP=""
|
|||
VM_DIR=""
|
||||
GUEST_IP=""
|
||||
UPLINK=""
|
||||
API_SOCK=""
|
||||
PID="0"
|
||||
VM_STATE=""
|
||||
LAST_ERROR=""
|
||||
|
||||
cleanup() {
|
||||
if [[ -n "${VM_NAME:-}" ]]; then
|
||||
|
|
@ -78,21 +139,15 @@ if (( NAT_ENABLED )); then
|
|||
fi
|
||||
"${CREATE_ARGS[@]}" >/dev/null
|
||||
|
||||
VM_JSON="$(./banger vm show "$VM_NAME")"
|
||||
name="$(printf '%s\n' "$VM_JSON" | jq -r '.name // empty')"
|
||||
guest_ip="$(printf '%s\n' "$VM_JSON" | jq -r '.runtime.guest_ip // empty')"
|
||||
tap="$(printf '%s\n' "$VM_JSON" | jq -r '.runtime.tap_device // empty')"
|
||||
vm_dir="$(printf '%s\n' "$VM_JSON" | jq -r '.runtime.vm_dir // empty')"
|
||||
BOOT_DEADLINE=$((SECONDS + BOOT_TIMEOUT_SECS))
|
||||
|
||||
if [[ -z "$name" || -z "$guest_ip" || -z "$tap" || -z "$vm_dir" ]]; then
|
||||
log "missing VM metadata from banger vm show"
|
||||
log "waiting for VM runtime readiness"
|
||||
if ! wait_for_vm_ready "$BOOT_DEADLINE"; then
|
||||
log "vm did not become ready before timeout"
|
||||
dump_diagnostics
|
||||
exit 1
|
||||
fi
|
||||
|
||||
TAP="$tap"
|
||||
VM_DIR="$vm_dir"
|
||||
GUEST_IP="$guest_ip"
|
||||
|
||||
if (( NAT_ENABLED )); then
|
||||
UPLINK="$(ip route show default 2>/dev/null | awk '/default/ {print $5; exit}')"
|
||||
if [[ -z "$UPLINK" ]]; then
|
||||
|
|
@ -106,17 +161,18 @@ if (( NAT_ENABLED )); then
|
|||
fi
|
||||
|
||||
log "asserting VM is reachable via SSH"
|
||||
if ! wait_for_ssh "$guest_ip"; then
|
||||
log "ssh did not become ready for ${guest_ip}"
|
||||
if ! wait_for_ssh "$GUEST_IP" "$BOOT_DEADLINE"; then
|
||||
log "ssh did not become ready for ${GUEST_IP}"
|
||||
dump_diagnostics
|
||||
exit 1
|
||||
fi
|
||||
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
|
||||
"root@${guest_ip}" "uname -a" >/dev/null
|
||||
"root@${GUEST_IP}" "uname -a" >/dev/null
|
||||
|
||||
if (( NAT_ENABLED )); then
|
||||
log "asserting VM has outbound network access"
|
||||
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
|
||||
"root@${guest_ip}" "curl -fsS https://example.com >/dev/null" >/dev/null
|
||||
"root@${GUEST_IP}" "curl -fsS https://example.com >/dev/null" >/dev/null
|
||||
fi
|
||||
|
||||
log "cleaning up VM"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue