banger/verify.sh
Thales Maciel 30f0c0b54a
Manage image artifacts and show VM create progress
Stop relying on ad hoc rootfs handling by adding image promotion, managed work-seed fingerprint metadata, and lazy self-healing for older managed images after the first create.

Rebuild guest images with baked SSH access, a guest NIC bootstrap, and default opencode services, and add the staged Void kernel/initramfs/modules workflow so void-exp uses a matching Void boot stack.

Replace the opaque blocking vm.create RPC with a begin/status flow that prints live stages in the CLI while still waiting for vsock health and opencode on guest port 4096.

Validate with GOCACHE=/tmp/banger-gocache go test ./... and live void-exp create/delete smoke runs.
2026-03-21 14:48:01 -03:00

320 lines
8.1 KiB
Bash
Executable file

#!/usr/bin/env bash
set -euo pipefail
log() {
printf '[verify] %s\n' "$*"
}
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DEFAULT_RUNTIME_DIR="$DIR"
if [[ -d "$DIR/runtime" ]]; then
DEFAULT_RUNTIME_DIR="$DIR/runtime"
fi
RUNTIME_DIR="${BANGER_RUNTIME_DIR:-$DEFAULT_RUNTIME_DIR}"
SSH_KEY="$RUNTIME_DIR/id_ed25519"
if [[ ! -d "$RUNTIME_DIR" ]]; then
log "runtime bundle not found: $RUNTIME_DIR"
log "run 'make runtime-bundle' or set BANGER_RUNTIME_DIR"
exit 1
fi
if [[ ! -f "$SSH_KEY" ]]; then
log "ssh key not found: $SSH_KEY"
exit 1
fi
DAEMON_LOG="${XDG_STATE_HOME:-$HOME/.local/state}/banger/bangerd.log"
SSH_COMMON_ARGS=(
-F /dev/null
-i "$SSH_KEY"
-o IdentitiesOnly=yes
-o BatchMode=yes
-o PreferredAuthentications=publickey
-o PasswordAuthentication=no
-o KbdInteractiveAuthentication=no
-o StrictHostKeyChecking=no
-o UserKnownHostsFile=/dev/null
)
OPENCODE_PORT=4096
firecracker_running() {
local pid="$1"
local api_sock="$2"
local cmdline=""
if [[ -z "$pid" || "$pid" -le 0 || -z "$api_sock" ]]; then
return 1
fi
if [[ ! -r "/proc/$pid/cmdline" ]]; then
return 1
fi
cmdline="$(cat "/proc/$pid/cmdline" 2>/dev/null | tr '\0' ' ' || true)"
[[ "$cmdline" == *firecracker* && "$cmdline" == *"$api_sock"* ]]
}
pooled_tap() {
local tap="$1"
[[ "$tap" == tap-pool-* ]]
}
wait_for_ssh() {
local guest_ip="$1"
local deadline="$2"
while ((SECONDS < deadline)); do
if ssh "${SSH_COMMON_ARGS[@]}" -o ConnectTimeout=2 "root@${guest_ip}" "true" >/dev/null 2>&1; then
return 0
fi
sleep 1
done
return 1
}
wait_for_tcp() {
local host="$1"
local port="$2"
local deadline="$3"
while ((SECONDS < deadline)); do
if (exec 3<>/dev/tcp/"$host"/"$port") >/dev/null 2>&1; then
return 0
fi
sleep 1
done
return 1
}
refresh_vm_metadata() {
if ! VM_JSON="$(./banger vm show "$VM_NAME" 2>/dev/null)"; then
return 1
fi
TAP="$(printf '%s\n' "$VM_JSON" | jq -r '.runtime.tap_device // empty')"
VM_DIR="$(printf '%s\n' "$VM_JSON" | jq -r '.runtime.vm_dir // empty')"
GUEST_IP="$(printf '%s\n' "$VM_JSON" | jq -r '.runtime.guest_ip // empty')"
API_SOCK="$(printf '%s\n' "$VM_JSON" | jq -r '.runtime.api_sock_path // empty')"
PID="$(printf '%s\n' "$VM_JSON" | jq -r '.runtime.pid // 0')"
VM_STATE="$(printf '%s\n' "$VM_JSON" | jq -r '.state // empty')"
LAST_ERROR="$(printf '%s\n' "$VM_JSON" | jq -r '.runtime.last_error // empty')"
return 0
}
wait_for_vm_ready() {
local deadline="$1"
while ((SECONDS < deadline)); do
if ! refresh_vm_metadata; then
sleep 1
continue
fi
if [[ "$VM_STATE" == "error" || -n "$LAST_ERROR" ]]; then
return 2
fi
if [[ -n "$API_SOCK" && "${PID:-0}" -gt 0 ]] && ! firecracker_running "$PID" "$API_SOCK"; then
return 3
fi
if [[ "$VM_STATE" == "running" && -n "$GUEST_IP" && -n "$TAP" && -n "$VM_DIR" && -n "$API_SOCK" && "${PID:-0}" -gt 0 ]]; then
if [[ -S "$API_SOCK" ]] && ip link show "$TAP" >/dev/null 2>&1; then
return 0
fi
fi
sleep 1
done
return 1
}
dump_diagnostics() {
log "diagnostics for $VM_NAME"
./banger vm show "$VM_NAME" || true
if [[ "${PID:-0}" -gt 0 ]]; then
log "process state for pid $PID"
ps -fp "$PID" || true
fi
log "recent firecracker log"
./banger vm logs "$VM_NAME" 2>/dev/null | tail -n 200 || true
if [[ -f "$DAEMON_LOG" ]]; then
log "recent daemon log"
tail -n 200 "$DAEMON_LOG" || true
fi
if [[ -n "${TAP:-}" ]]; then
log "tap state for $TAP"
ip link show "$TAP" || true
fi
if [[ -n "${API_SOCK:-}" ]]; then
log "api socket $API_SOCK"
ls -l "$API_SOCK" 2>/dev/null || true
fi
if (( NAT_ENABLED )) && [[ -n "${UPLINK:-}" && -n "${GUEST_IP:-}" && -n "${TAP:-}" ]]; then
log "nat rules for ${GUEST_IP} via ${UPLINK}"
sudo iptables -t nat -S POSTROUTING | grep "${GUEST_IP}/32" || true
sudo iptables -S FORWARD | grep "$TAP" || true
fi
}
usage() {
cat <<'EOF'
Usage: ./verify.sh [--nat] [--image <name>]
Run a basic smoke test for the Go VM workflow.
Use --nat to additionally verify outbound NAT and host rule cleanup.
Use --image to verify a non-default image such as void-exp.
EOF
}
NAT_ENABLED=0
IMAGE_NAME=""
BOOT_TIMEOUT_SECS="${VERIFY_BOOT_TIMEOUT_SECS:-90}"
while [[ $# -gt 0 ]]; do
case "$1" in
--nat)
NAT_ENABLED=1
shift
;;
--image)
IMAGE_NAME="${2:-}"
if [[ -z "$IMAGE_NAME" ]]; then
usage
exit 1
fi
shift 2
;;
*)
usage
exit 1
;;
esac
done
VM_NAME="verify-$(date +%s)"
VM_JSON=""
TAP=""
VM_DIR=""
GUEST_IP=""
UPLINK=""
API_SOCK=""
PID="0"
VM_STATE=""
LAST_ERROR=""
delete_vm() {
if [[ -n "${VM_NAME:-}" ]]; then
./banger vm delete "$VM_NAME"
fi
}
cleanup() {
if [[ -n "${VM_NAME:-}" ]]; then
./banger vm delete "$VM_NAME" >/dev/null 2>&1 || true
fi
}
trap cleanup EXIT
log "starting VM"
CREATE_ARGS=(./banger vm create --name "$VM_NAME")
if [[ -n "$IMAGE_NAME" ]]; then
CREATE_ARGS+=(--image "$IMAGE_NAME")
fi
if (( NAT_ENABLED )); then
CREATE_ARGS+=(--nat)
fi
"${CREATE_ARGS[@]}" >/dev/null
BOOT_DEADLINE=$((SECONDS + BOOT_TIMEOUT_SECS))
log "waiting for VM runtime readiness"
if wait_for_vm_ready "$BOOT_DEADLINE"; then
:
else
status=$?
case "$status" in
2) log "vm entered an error state before becoming ready" ;;
3) log "firecracker exited before the guest became ready" ;;
*) log "vm did not become ready before timeout" ;;
esac
dump_diagnostics
exit 1
fi
if (( NAT_ENABLED )); then
UPLINK="$(ip route show default 2>/dev/null | awk '/default/ {print $5; exit}')"
if [[ -z "$UPLINK" ]]; then
log "failed to detect uplink interface"
exit 1
fi
log "asserting NAT rules are installed"
sudo iptables -t nat -C POSTROUTING -s "${GUEST_IP}/32" -o "$UPLINK" -j MASQUERADE
sudo iptables -C FORWARD -i "$TAP" -o "$UPLINK" -j ACCEPT
sudo iptables -C FORWARD -i "$UPLINK" -o "$TAP" -m state --state RELATED,ESTABLISHED -j ACCEPT
fi
log "asserting VM is reachable via SSH"
if ! wait_for_ssh "$GUEST_IP" "$BOOT_DEADLINE"; then
log "ssh did not become ready for ${GUEST_IP}"
dump_diagnostics
exit 1
fi
ssh "${SSH_COMMON_ARGS[@]}" "root@${GUEST_IP}" "uname -a" >/dev/null
log "asserting opencode is available and listening in the guest"
ssh "${SSH_COMMON_ARGS[@]}" "root@${GUEST_IP}" "command -v opencode >/dev/null 2>&1 && ss -H -lntp | awk '\$4 ~ /:${OPENCODE_PORT}\$/ { found = 1 } END { exit found ? 0 : 1 }'" >/dev/null
log "asserting opencode server is reachable from the host"
if ! wait_for_tcp "$GUEST_IP" "$OPENCODE_PORT" "$BOOT_DEADLINE"; then
log "opencode server did not become reachable at ${GUEST_IP}:${OPENCODE_PORT}"
dump_diagnostics
exit 1
fi
log "asserting opencode port is reported by banger vm ports"
if ! ./banger vm ports "$VM_NAME" | grep -F ":${OPENCODE_PORT}" >/dev/null 2>&1; then
log "banger vm ports did not report ${OPENCODE_PORT}"
dump_diagnostics
exit 1
fi
if (( NAT_ENABLED )); then
log "asserting VM has outbound network access"
ssh "${SSH_COMMON_ARGS[@]}" "root@${GUEST_IP}" "curl -fsS https://example.com >/dev/null" >/dev/null
fi
log "cleaning up VM"
if ! delete_vm; then
log "vm delete failed for $VM_NAME"
dump_diagnostics
exit 1
fi
log "asserting cleanup success"
if ./banger vm show "$VM_NAME" >/dev/null 2>&1; then
log "vm still exists after delete: $VM_NAME"
exit 1
fi
if ip link show "$TAP" >/dev/null 2>&1; then
if pooled_tap "$TAP"; then
log "tap returned to idle pool: $TAP"
else
log "tap still exists: $TAP"
exit 1
fi
fi
if [[ -d "$VM_DIR" ]]; then
log "vm dir still exists: $VM_DIR"
exit 1
fi
if (( NAT_ENABLED )); then
if sudo iptables -t nat -C POSTROUTING -s "${GUEST_IP}/32" -o "$UPLINK" -j MASQUERADE 2>/dev/null; then
log "nat rule still exists for ${GUEST_IP}"
exit 1
fi
if sudo iptables -C FORWARD -i "$TAP" -o "$UPLINK" -j ACCEPT 2>/dev/null; then
log "forward-out rule still exists for ${TAP}"
exit 1
fi
if sudo iptables -C FORWARD -i "$UPLINK" -o "$TAP" -m state --state RELATED,ESTABLISHED -j ACCEPT 2>/dev/null; then
log "forward-in rule still exists for ${TAP}"
exit 1
fi
fi
log "ok"