Beat VM create wall time without changing VM semantics.

Generate a work-seed ext4 sidecar during image builds and rootfs rebuilds, then clone and resize that seed for each new VM instead of rebuilding /root from scratch. Plumb the new seed artifact through config, runtime metadata, store state, runtime-bundle defaults, doctor checks, and default-image reconciliation so older images still fall back cleanly. Add a daemon TAP pool to keep idle bridge-attached devices warm, expose stage timing in lifecycle logs, add a create/SSH benchmark script plus Make target, and teach verify.sh that tap-pool-* devices are reusable capacity rather than cleanup leaks. Validated with go test ./..., make build, ./verify.sh, and make bench-create ARGS="--runs 2".
262 lines
6.7 KiB
Bash
Executable file
262 lines
6.7 KiB
Bash
Executable file
#!/usr/bin/env bash
#
# verify.sh - smoke test for the banger VM workflow.
#
# Creates a throw-away VM through ./banger, waits for it to become ready,
# checks SSH reachability (and, with --nat, the host NAT rules plus outbound
# access), deletes the VM and asserts that nothing was left behind.
#
# Environment read by this script:
#   BANGER_RUNTIME_DIR        runtime bundle directory override
#   VERIFY_BOOT_TIMEOUT_SECS  boot/SSH deadline in seconds (default 90)
#   XDG_STATE_HOME            base directory used to locate the daemon log
set -euo pipefail

# Print one message line on stdout, prefixed with the "[verify]" tag.
# Arguments: $* - message words; they are joined with the first character
#            of IFS (a space by default) into a single line.
log() {
  printf '[verify] %s\n' "$*"
}

# Absolute directory that contains this script.
DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

# The runtime bundle defaults to "$DIR/runtime" when that directory exists,
# otherwise to the script directory itself; BANGER_RUNTIME_DIR overrides both.
DEFAULT_RUNTIME_DIR="$DIR"
if [[ -d "$DIR/runtime" ]]; then
  DEFAULT_RUNTIME_DIR="$DIR/runtime"
fi
RUNTIME_DIR="${BANGER_RUNTIME_DIR:-$DEFAULT_RUNTIME_DIR}"
SSH_KEY="$RUNTIME_DIR/id_ed25519"

if [[ ! -d "$RUNTIME_DIR" ]]; then
  log "runtime bundle not found: $RUNTIME_DIR"
  log "run 'make runtime-bundle' or set BANGER_RUNTIME_DIR"
  exit 1
fi
if [[ ! -f "$SSH_KEY" ]]; then
  log "ssh key not found: $SSH_KEY"
  exit 1
fi

# Daemon log, only read when dumping diagnostics. ${HOME:-} keeps `set -u`
# from aborting the script when neither XDG_STATE_HOME nor HOME is set.
# NOTE(review): the rest of the script invokes ./banger relative to the
# current working directory, not $DIR - confirm callers always run it from
# the directory that holds the banger binary.
DAEMON_LOG="${XDG_STATE_HOME:-${HOME:-}/.local/state}/banger/bangerd.log"

# Check whether a pid belongs to a firecracker process that was started
# with the given API socket.
# Arguments:
#   $1 - process id to inspect
#   $2 - API socket path expected on the process command line
# Returns:
#   0 when /proc/<pid>/cmdline contains both "firecracker" and the socket
#   path; 1 otherwise (empty, zero, negative or non-numeric pid, empty
#   socket path, or unreadable cmdline).
firecracker_running() {
  local pid="${1:-}"
  local api_sock="${2:-}"
  local cmdline=""

  # Validate the pid with a regex instead of an arithmetic comparison: inside
  # [[ ]] the -le operator evaluates its operands arithmetically, so a
  # non-numeric value would be treated as a variable name and abort the
  # script under `set -u`.
  if [[ ! "$pid" =~ ^[1-9][0-9]*$ || -z "$api_sock" ]]; then
    return 1
  fi
  if [[ ! -r "/proc/$pid/cmdline" ]]; then
    return 1
  fi

  # cmdline is NUL-separated; turn the separators into spaces. The process
  # may vanish between the readability check and the read, hence the
  # stderr suppression (placed before the input redirect so it also covers
  # a failed open) and the `|| true`.
  cmdline="$(tr '\0' ' ' 2>/dev/null <"/proc/$pid/cmdline" || true)"
  [[ "$cmdline" == *firecracker* && "$cmdline" == *"$api_sock"* ]]
}

# Tell whether a TAP device name matches the "tap-pool-*" naming pattern.
# Arguments: $1 - TAP device name (a missing argument counts as empty)
# Returns:   0 when the name starts with "tap-pool-", 1 otherwise.
pooled_tap() {
  local tap="${1:-}"
  [[ "$tap" == tap-pool-* ]]
}

# Poll the guest until a trivial SSH command succeeds or the deadline passes.
# Globals:   SSH_KEY (read) - private key used for authentication
# Arguments:
#   $1 - guest IP address
#   $2 - deadline, expressed as a value of the shell's SECONDS counter
# Returns:   0 once `ssh root@<ip> true` succeeds, 1 when the deadline is
#            reached first.
wait_for_ssh() {
  local guest_ip="$1"
  local deadline="$2"
  # BatchMode=yes makes ssh fail instead of prompting for a password or
  # passphrase, so a rejected key cannot stall the polling loop.
  local -a ssh_opts=(
    -i "$SSH_KEY"
    -o BatchMode=yes
    -o StrictHostKeyChecking=no
    -o UserKnownHostsFile=/dev/null
    -o ConnectTimeout=2
  )

  while ((SECONDS < deadline)); do
    if ssh "${ssh_opts[@]}" "root@${guest_ip}" "true" >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done

  return 1
}

# Re-read the VM description from `./banger vm show` and publish selected
# fields as globals.
# Globals:
#   VM_NAME (read)
#   VM_JSON, TAP, VM_DIR, GUEST_IP, API_SOCK, PID, VM_STATE, LAST_ERROR
#   (written)
# Returns:
#   0 when the VM could be shown and every field was extracted;
#   1 when `./banger vm show` or any jq extraction fails. In the jq failure
#   case the field globals keep their previous values, so callers never see
#   a half-updated snapshot.
refresh_vm_metadata() {
  local tap vm_dir guest_ip api_sock pid vm_state last_error

  if ! VM_JSON="$(./banger vm show "$VM_NAME" 2>/dev/null)"; then
    return 1
  fi

  # Extract into locals first; a jq failure (e.g. malformed JSON) is reported
  # to the caller instead of silently yielding empty fields.
  tap="$(jq -r '.runtime.tap_device // empty' <<<"$VM_JSON")" || return 1
  vm_dir="$(jq -r '.runtime.vm_dir // empty' <<<"$VM_JSON")" || return 1
  guest_ip="$(jq -r '.runtime.guest_ip // empty' <<<"$VM_JSON")" || return 1
  api_sock="$(jq -r '.runtime.api_sock_path // empty' <<<"$VM_JSON")" || return 1
  pid="$(jq -r '.runtime.pid // 0' <<<"$VM_JSON")" || return 1
  vm_state="$(jq -r '.state // empty' <<<"$VM_JSON")" || return 1
  last_error="$(jq -r '.runtime.last_error // empty' <<<"$VM_JSON")" || return 1

  TAP="$tap"
  VM_DIR="$vm_dir"
  GUEST_IP="$guest_ip"
  API_SOCK="$api_sock"
  PID="$pid"
  VM_STATE="$vm_state"
  LAST_ERROR="$last_error"
  return 0
}

# Poll VM metadata until the VM is fully up, fails, or the deadline passes.
# Globals:
#   VM_STATE, LAST_ERROR, API_SOCK, PID, GUEST_IP, TAP, VM_DIR
#   (refreshed through refresh_vm_metadata on every iteration)
# Arguments: $1 - deadline, expressed as a value of the SECONDS counter
# Returns:
#   0 - state is "running", all runtime fields are populated, the API socket
#       exists and the TAP device is present
#   1 - deadline reached
#   2 - VM reported state "error" or a non-empty last_error
#   3 - metadata names a pid/socket but firecracker_running rejects them
wait_for_vm_ready() {
  local deadline="$1"

  while ((SECONDS < deadline)); do
    if ! refresh_vm_metadata; then
      sleep 1
      continue
    fi

    if [[ "$VM_STATE" == "error" || -n "$LAST_ERROR" ]]; then
      return 2
    fi

    # Both remaining checks need a socket path and a positive pid. The pid is
    # matched with a regex so an unexpected non-numeric value cannot break an
    # arithmetic comparison.
    if [[ -n "$API_SOCK" && "${PID:-0}" =~ ^[1-9][0-9]*$ ]]; then
      if ! firecracker_running "$PID" "$API_SOCK"; then
        return 3
      fi
      if [[ "$VM_STATE" == "running" && -n "$GUEST_IP" && -n "$TAP" && -n "$VM_DIR" ]] \
        && [[ -S "$API_SOCK" ]] \
        && ip link show "$TAP" >/dev/null 2>&1; then
        return 0
      fi
    fi

    sleep 1
  done

  return 1
}

# Print best-effort debugging information about the test VM. Every probe is
# allowed to fail (`|| true`) so this never aborts the script by itself.
# Globals (read): VM_NAME, PID, DAEMON_LOG, TAP, API_SOCK, NAT_ENABLED,
#                 UPLINK, GUEST_IP
# Outputs: VM description, process state, the last 200 lines of the VM and
#          daemon logs, TAP and API socket state and, when NAT is enabled and
#          the relevant values are known, the matching iptables rules.
dump_diagnostics() {
  log "diagnostics for $VM_NAME"
  ./banger vm show "$VM_NAME" || true

  if [[ "${PID:-0}" =~ ^[1-9][0-9]*$ ]]; then
    log "process state for pid $PID"
    ps -fp "$PID" || true
  fi

  log "recent firecracker log"
  ./banger vm logs "$VM_NAME" 2>/dev/null | tail -n 200 || true

  if [[ -f "$DAEMON_LOG" ]]; then
    log "recent daemon log"
    tail -n 200 "$DAEMON_LOG" || true
  fi

  if [[ -n "${TAP:-}" ]]; then
    log "tap state for $TAP"
    ip link show "$TAP" || true
  fi

  if [[ -n "${API_SOCK:-}" ]]; then
    log "api socket $API_SOCK"
    ls -l "$API_SOCK" 2>/dev/null || true
  fi

  if ((NAT_ENABLED)) && [[ -n "${UPLINK:-}" && -n "${GUEST_IP:-}" && -n "${TAP:-}" ]]; then
    log "nat rules for ${GUEST_IP} via ${UPLINK}"
    # -F: match the address / device name literally; as a regex the dots in
    # the IP would match any character.
    sudo iptables -t nat -S POSTROUTING | grep -F -- "${GUEST_IP}/32" || true
    sudo iptables -S FORWARD | grep -F -- "$TAP" || true
  fi
}

# Print the command-line help text on stdout.
usage() {
  cat <<'EOF'
Usage: ./verify.sh [--nat]

Run a basic smoke test for the Go VM workflow.
Use --nat to additionally verify outbound NAT and host rule cleanup.
EOF
}

# Command-line / environment configuration.
NAT_ENABLED=0
BOOT_TIMEOUT_SECS="${VERIFY_BOOT_TIMEOUT_SECS:-90}"

case "${1:-}" in
  --nat)
    NAT_ENABLED=1
    shift
    ;;
  -h | --help)
    usage
    exit 0
    ;;
esac
# Anything left over is an unsupported argument.
if (($# != 0)); then
  usage
  exit 1
fi

# The timeout is later used in shell arithmetic; reject values that are not
# plain positive decimal integers (a leading zero would be read as octal).
if [[ ! "$BOOT_TIMEOUT_SECS" =~ ^[1-9][0-9]*$ ]]; then
  log "VERIFY_BOOT_TIMEOUT_SECS must be a positive integer: $BOOT_TIMEOUT_SECS"
  exit 1
fi

# State shared between the helper functions. The runtime fields are filled in
# by refresh_vm_metadata; UPLINK is set only when NAT checks are enabled.
VM_NAME="verify-$(date +%s)"
VM_JSON=""
TAP=""
VM_DIR=""
GUEST_IP=""
UPLINK=""
API_SOCK=""
PID="0"
VM_STATE=""
LAST_ERROR=""

# Delete the test VM and report the outcome to the caller.
# Globals: VM_NAME (read)
# Returns: the exit status of `./banger vm delete`, or 0 when VM_NAME is
#          empty or unset (nothing to delete).
delete_vm() {
  [[ -n "${VM_NAME:-}" ]] || return 0
  ./banger vm delete "$VM_NAME"
}

# Best-effort VM removal intended for use as an EXIT trap handler: output is
# discarded and a failed delete is ignored, so it always returns 0.
# Globals: VM_NAME (read)
cleanup() {
  [[ -n "${VM_NAME:-}" ]] || return 0
  ./banger vm delete "$VM_NAME" >/dev/null 2>&1 || true
}

# From here on the VM may exist; make sure it is removed on every exit path.
trap cleanup EXIT

log "starting VM"
CREATE_ARGS=(./banger vm create --name "$VM_NAME")
if ((NAT_ENABLED)); then
  CREATE_ARGS+=(--nat)
fi
# Report a failed create explicitly instead of letting `set -e` end the
# script without any message or diagnostics.
if ! "${CREATE_ARGS[@]}" >/dev/null; then
  log "vm create failed for $VM_NAME"
  dump_diagnostics
  exit 1
fi

# One deadline shared by the readiness wait and the later SSH wait.
BOOT_DEADLINE=$((SECONDS + BOOT_TIMEOUT_SECS))

log "waiting for VM runtime readiness"
status=0
wait_for_vm_ready "$BOOT_DEADLINE" || status=$?
if ((status != 0)); then
  case "$status" in
    2) log "vm entered an error state before becoming ready" ;;
    3) log "firecracker exited before the guest became ready" ;;
    *) log "vm did not become ready before timeout" ;;
  esac
  dump_diagnostics
  exit 1
fi

if ((NAT_ENABLED)); then
  # Take the device that follows the "dev" keyword of the first default route
  # rather than a fixed field number, so routes without a "via <gateway>"
  # part ("default dev tun0 ...") are parsed correctly as well. `|| true`
  # leaves UPLINK empty on failure so the explicit check below reports it.
  UPLINK="$(
    ip route show default 2>/dev/null \
      | awk '$1 == "default" { for (i = 2; i < NF; i++) if ($i == "dev") { print $(i + 1); exit } }' \
      || true
  )"
  if [[ -z "$UPLINK" ]]; then
    log "failed to detect uplink interface"
    exit 1
  fi

  log "asserting NAT rules are installed"
  # `iptables -C` exits non-zero when the rule is absent; name the missing
  # rule and dump diagnostics instead of exiting silently under `set -e`.
  if ! sudo iptables -t nat -C POSTROUTING -s "${GUEST_IP}/32" -o "$UPLINK" -j MASQUERADE; then
    log "nat rule missing for ${GUEST_IP}"
    dump_diagnostics
    exit 1
  fi
  if ! sudo iptables -C FORWARD -i "$TAP" -o "$UPLINK" -j ACCEPT; then
    log "forward-out rule missing for ${TAP}"
    dump_diagnostics
    exit 1
  fi
  if ! sudo iptables -C FORWARD -i "$UPLINK" -o "$TAP" -m state --state RELATED,ESTABLISHED -j ACCEPT; then
    log "forward-in rule missing for ${TAP}"
    dump_diagnostics
    exit 1
  fi
fi

log "asserting VM is reachable via SSH"
if ! wait_for_ssh "$GUEST_IP" "$BOOT_DEADLINE"; then
  log "ssh did not become ready for ${GUEST_IP}"
  dump_diagnostics
  exit 1
fi

# Options for the one-shot guest commands below. BatchMode and ConnectTimeout
# keep a broken guest from hanging the test on a prompt or a dead connection.
SSH_ARGS=(
  -i "$SSH_KEY"
  -o BatchMode=yes
  -o StrictHostKeyChecking=no
  -o UserKnownHostsFile=/dev/null
  -o ConnectTimeout=10
)

if ! ssh "${SSH_ARGS[@]}" "root@${GUEST_IP}" "uname -a" >/dev/null; then
  log "ssh command failed on ${GUEST_IP}"
  dump_diagnostics
  exit 1
fi

if ((NAT_ENABLED)); then
  log "asserting VM has outbound network access"
  # --max-time bounds the request so an unreachable network fails the check
  # instead of blocking it indefinitely.
  if ! ssh "${SSH_ARGS[@]}" "root@${GUEST_IP}" \
    "curl -fsS --max-time 20 https://example.com >/dev/null" >/dev/null; then
    log "outbound network check failed for ${GUEST_IP}"
    dump_diagnostics
    exit 1
  fi
fi

log "cleaning up VM"
if ! delete_vm; then
  log "vm delete failed for $VM_NAME"
  dump_diagnostics
  exit 1
fi

# TAP, VM_DIR, GUEST_IP and UPLINK still hold the values captured while the
# VM was running; use them to check that the delete left nothing behind.
log "asserting cleanup success"
if ./banger vm show "$VM_NAME" >/dev/null 2>&1; then
  log "vm still exists after delete: $VM_NAME"
  exit 1
fi

if [[ -n "$TAP" ]] && ip link show "$TAP" >/dev/null 2>&1; then
  # A surviving tap-pool-* device is reusable pool capacity, not a leak.
  if pooled_tap "$TAP"; then
    log "tap returned to idle pool: $TAP"
  else
    log "tap still exists: $TAP"
    exit 1
  fi
fi

if [[ -d "$VM_DIR" ]]; then
  log "vm dir still exists: $VM_DIR"
  exit 1
fi

if ((NAT_ENABLED)); then
  # `iptables -C` succeeds only when the rule is still present.
  if sudo iptables -t nat -C POSTROUTING -s "${GUEST_IP}/32" -o "$UPLINK" -j MASQUERADE 2>/dev/null; then
    log "nat rule still exists for ${GUEST_IP}"
    exit 1
  fi
  if sudo iptables -C FORWARD -i "$TAP" -o "$UPLINK" -j ACCEPT 2>/dev/null; then
    log "forward-out rule still exists for ${TAP}"
    exit 1
  fi
  if sudo iptables -C FORWARD -i "$UPLINK" -o "$TAP" -m state --state RELATED,ESTABLISHED -j ACCEPT 2>/dev/null; then
    log "forward-in rule still exists for ${TAP}"
    exit 1
  fi
fi

log "ok"