From 60294e8c908fbfbfbc81bc5fbe58777628a4f867 Mon Sep 17 00:00:00 2001 From: Thales Maciel Date: Tue, 17 Mar 2026 14:43:09 -0300 Subject: [PATCH] Fix VM lifecycle issues behind verify.sh Make the Firecracker and bangerd processes outlive short-lived CLI request contexts so vm create no longer kills the VMM or daemon as soon as the RPC returns. Fix fresh-VM SSH by flattening the seeded /root work disk when the copied home tree lands under a nested root/ directory, and write a guest sshd override to keep root pubkey auth explicit while debugging. Harden teardown and smoke diagnostics: verify.sh now reports early Firecracker exit and delete failures directly, while dm snapshot cleanup tolerates already-gone handles and retries busy mapper removal long enough for Firecracker to release the device. Validation: go test ./..., make build, bash -n verify.sh, direct SSH against a fresh VM, and a live ./verify.sh run that now completes with [verify] ok. --- internal/cli/banger.go | 6 +++- internal/cli/cli_test.go | 11 +++++++ internal/daemon/snapshot.go | 40 +++++++++++++++++++--- internal/daemon/vm.go | 51 ++++++++++++++++++++++++----- internal/firecracker/client.go | 6 ++-- internal/firecracker/client_test.go | 6 ++-- verify.sh | 50 ++++++++++++++++++++++++++-- 7 files changed, 149 insertions(+), 21 deletions(-) diff --git a/internal/cli/banger.go b/internal/cli/banger.go index 8eb9550..4b337c5 100644 --- a/internal/cli/banger.go +++ b/internal/cli/banger.go @@ -641,7 +641,7 @@ func startDaemon(ctx context.Context, layout paths.Layout) error { if err != nil { return err } - cmd := exec.CommandContext(ctx, daemonBin) + cmd := buildDaemonCommand(daemonBin) cmd.Stdout = logFile cmd.Stderr = logFile cmd.Stdin = nil @@ -655,6 +655,10 @@ func startDaemon(ctx context.Context, layout paths.Layout) error { return nil } +func buildDaemonCommand(daemonBin string) *exec.Cmd { + return exec.Command(daemonBin) +} + func vmSetParamsFromFlags(idOrName string, vcpu, memory int, diskSize string, nat, noNat bool) (api.VMSetParams, error) { if nat && noNat { return api.VMSetParams{}, errors.New("use only one of --nat or --no-nat") diff --git a/internal/cli/cli_test.go b/internal/cli/cli_test.go index 6a25834..7bb153a 100644 --- a/internal/cli/cli_test.go +++ b/internal/cli/cli_test.go @@ -244,6 +244,17 @@ func TestDaemonStatusIncludesLogPathWhenStopped(t *testing.T) { } } +func TestBuildDaemonCommandIsDetachedFromCallerContext(t *testing.T) { + cmd := buildDaemonCommand("/tmp/bangerd") + + if cmd.Path != "/tmp/bangerd" { + t.Fatalf("command path = %q", cmd.Path) + } + if cmd.Cancel != nil { + t.Fatal("daemon process should not be tied to a CLI request context") + } +} + func TestAbsolutizeImageBuildPaths(t *testing.T) { dir := t.TempDir() prev, err := os.Getwd() diff --git a/internal/daemon/snapshot.go b/internal/daemon/snapshot.go index 1a3fda2..e184bdf 100644 --- a/internal/daemon/snapshot.go +++ b/internal/daemon/snapshot.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "strings" + "time" ) type dmSnapshotHandles struct { @@ -55,25 +56,56 @@ func (d *Daemon) cleanupDMSnapshot(ctx context.Context, handles dmSnapshotHandle switch { case handles.DMName != "": - if _, err := d.runner.RunSudo(ctx, "dmsetup", "remove", handles.DMName); err != nil { + if err := d.removeDMSnapshot(ctx, handles.DMName); err != nil { cleanupErr = errors.Join(cleanupErr, err) } case handles.DMDev != "": - if _, err := d.runner.RunSudo(ctx, "dmsetup", "remove", handles.DMDev); err != nil { + if err := d.removeDMSnapshot(ctx, handles.DMDev); err != nil { cleanupErr = errors.Join(cleanupErr, err) } } if handles.COWLoop != "" { if _, err := d.runner.RunSudo(ctx, "losetup", "-d", handles.COWLoop); err != nil { - cleanupErr = errors.Join(cleanupErr, err) + if !isMissingSnapshotHandle(err) { + cleanupErr = errors.Join(cleanupErr, err) + } } } if handles.BaseLoop != "" { if _, err := d.runner.RunSudo(ctx, "losetup", "-d", handles.BaseLoop); err != nil { - cleanupErr = errors.Join(cleanupErr, err) + if !isMissingSnapshotHandle(err) { + cleanupErr = errors.Join(cleanupErr, err) + } } } return cleanupErr } + +func (d *Daemon) removeDMSnapshot(ctx context.Context, target string) error { + deadline := time.Now().Add(3 * time.Second) + for { + if _, err := d.runner.RunSudo(ctx, "dmsetup", "remove", target); err != nil { + if isMissingSnapshotHandle(err) { + return nil + } + if strings.Contains(err.Error(), "Device or resource busy") && time.Now().Before(deadline) { + time.Sleep(100 * time.Millisecond) + continue + } + return err + } + return nil + } +} + +func isMissingSnapshotHandle(err error) bool { + if err == nil { + return false + } + msg := err.Error() + return strings.Contains(msg, "No such device or address") || + strings.Contains(msg, "not found") || + strings.Contains(msg, "does not exist") +} diff --git a/internal/daemon/vm.go b/internal/daemon/vm.go index 27978be..3f3984c 100644 --- a/internal/daemon/vm.go +++ b/internal/daemon/vm.go @@ -238,7 +238,8 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod return cleanupOnErr(err) } op.stage("firecracker_launch", "log_path", vm.Runtime.LogPath, "metrics_path", vm.Runtime.MetricsPath) - machine, err := firecracker.NewMachine(ctx, firecracker.MachineConfig{ + firecrackerCtx := context.Background() + machine, err := firecracker.NewMachine(firecrackerCtx, firecracker.MachineConfig{ BinaryPath: fcPath, VMID: vm.ID, SocketPath: apiSock, @@ -257,11 +258,11 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod if err != nil { return cleanupOnErr(err) } - if err := machine.Start(ctx); err != nil { - vm.Runtime.PID = d.resolveFirecrackerPID(ctx, machine, apiSock) + if err := machine.Start(firecrackerCtx); err != nil { + vm.Runtime.PID = d.resolveFirecrackerPID(firecrackerCtx, machine, apiSock) return cleanupOnErr(err) } - vm.Runtime.PID = d.resolveFirecrackerPID(ctx, machine, apiSock) + vm.Runtime.PID = d.resolveFirecrackerPID(firecrackerCtx, machine, apiSock) op.debugStage("firecracker_started", "pid", vm.Runtime.PID) op.stage("socket_access", "api_socket", apiSock) if err := d.ensureSocketAccess(ctx, apiSock); err != nil { @@ -640,16 +641,25 @@ func (d *Daemon) patchRootOverlay(ctx context.Context, vm model.VMRecord, image resolv := []byte(fmt.Sprintf("nameserver %s\n", d.config.DefaultDNS)) hostname := []byte(vm.Name + "\n") hosts := []byte(fmt.Sprintf("127.0.0.1 localhost\n127.0.1.1 %s\n", vm.Name)) + sshdConfig := []byte(strings.Join([]string{ + "LogLevel DEBUG3", + "PermitRootLogin yes", + "PubkeyAuthentication yes", + "AuthorizedKeysFile /root/.ssh/authorized_keys", + "StrictModes no", + "", + }, "\n")) fstab, err := system.ReadDebugFSText(ctx, d.runner, vm.Runtime.DMDev, "/etc/fstab") if err != nil { fstab = "" } newFSTab := system.UpdateFSTab(fstab) for guestPath, data := range map[string][]byte{ - "/etc/resolv.conf": resolv, - "/etc/hostname": hostname, - "/etc/hosts": hosts, - "/etc/fstab": []byte(newFSTab), + "/etc/resolv.conf": resolv, + "/etc/hostname": hostname, + "/etc/hosts": hosts, + "/etc/fstab": []byte(newFSTab), + "/etc/ssh/sshd_config.d/99-banger.conf": sshdConfig, } { if err := system.WriteExt4File(ctx, d.runner, vm.Runtime.DMDev, guestPath, data); err != nil { return err @@ -681,9 +691,31 @@ func (d *Daemon) ensureWorkDisk(ctx context.Context, vm *model.VMRecord) error { if err := system.CopyDirContents(ctx, d.runner, filepath.Join(rootMount, "root"), workMount, true); err != nil { return err } + if err := d.flattenNestedWorkHome(ctx, workMount); err != nil { + return err + } return nil } +func (d *Daemon) flattenNestedWorkHome(ctx context.Context, workMount string) error { + nestedHome := filepath.Join(workMount, "root") + if !exists(nestedHome) { + return nil + } + script := `set -e +src="$1" +dst="$2" +for path in "$src"/.[!.]* "$src"/..?* "$src"/*; do + [ -e "$path" ] || continue + cp -a "$path" "$dst"/ +done` + if _, err := d.runner.RunSudo(ctx, "sh", "-c", script, "sh", nestedHome, workMount); err != nil { + return err + } + _, err := d.runner.RunSudo(ctx, "rm", "-rf", nestedHome) + return err +} + func (d *Daemon) ensureBridge(ctx context.Context) error { if _, err := d.runner.Run(ctx, "ip", "link", "show", d.config.BridgeName); err == nil { _, err = d.runner.RunSudo(ctx, "ip", "link", "set", d.config.BridgeName, "up") @@ -790,6 +822,9 @@ func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserve } if vm.Runtime.PID > 0 && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) { _ = d.killVMProcess(ctx, vm.Runtime.PID) + if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil { + return err + } } if vm.Runtime.TapDevice != "" { _, _ = d.runner.RunSudo(ctx, "ip", "link", "del", vm.Runtime.TapDevice) diff --git a/internal/firecracker/client.go b/internal/firecracker/client.go index 171408e..ce0e3c0 100644 --- a/internal/firecracker/client.go +++ b/internal/firecracker/client.go @@ -47,7 +47,7 @@ func NewMachine(ctx context.Context, cfg MachineConfig) (*Machine, error) { return nil, err } - cmd := buildProcessRunner(ctx, cfg, logFile) + cmd := buildProcessRunner(cfg, logFile) machine, err := sdk.NewMachine( ctx, buildConfig(cfg), @@ -131,7 +131,7 @@ func buildConfig(cfg MachineConfig) sdk.Config { } } -func buildProcessRunner(ctx context.Context, cfg MachineConfig, logFile *os.File) *exec.Cmd { +func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd { script := strings.Join([]string{ "umask 000", "exec " + shellQuote(cfg.BinaryPath) + @@ -139,7 +139,7 @@ func buildProcessRunner(ctx context.Context, cfg MachineConfig, logFile *os.File " --id " + shellQuote(cfg.VMID), }, " && ") - cmd := exec.CommandContext(ctx, "sudo", "-n", "sh", "-c", script) + cmd := exec.Command("sudo", "-n", "sh", "-c", script) cmd.Stdin = nil if logFile != nil { cmd.Stdout = logFile diff --git a/internal/firecracker/client_test.go b/internal/firecracker/client_test.go index 69ba7d3..218a21a 100644 --- a/internal/firecracker/client_test.go +++ b/internal/firecracker/client_test.go @@ -2,7 +2,6 @@ package firecracker import ( "bytes" - "context" "log/slog" "strings" "testing" @@ -60,7 +59,7 @@ func TestBuildConfig(t *testing.T) { } func TestBuildProcessRunnerUsesSudoWrapper(t *testing.T) { - cmd := buildProcessRunner(context.Background(), MachineConfig{ + cmd := buildProcessRunner(MachineConfig{ BinaryPath: "/repo/firecracker", SocketPath: "/tmp/fc.sock", VMID: "vm-1", @@ -78,6 +77,9 @@ func TestBuildProcessRunnerUsesSudoWrapper(t *testing.T) { if want := "umask 000 && exec '/repo/firecracker' --api-sock '/tmp/fc.sock' --id 'vm-1'"; cmd.Args[4] != want { t.Fatalf("script = %q, want %q", cmd.Args[4], want) } + if cmd.Cancel != nil { + t.Fatal("process runner should not be tied to a request context") + } } func TestSDKLoggerBridgeEmitsStructuredDebugLogs(t *testing.T) { diff --git a/verify.sh b/verify.sh index 015e3f5..7cf278b 100755 --- a/verify.sh +++ b/verify.sh @@ -21,6 +21,22 @@ if [[ ! -f "$SSH_KEY" ]]; then log "ssh key not found: $SSH_KEY" exit 1 fi +DAEMON_LOG="${XDG_STATE_HOME:-$HOME/.local/state}/banger/bangerd.log" + +firecracker_running() { + local pid="$1" + local api_sock="$2" + local cmdline="" + + if [[ -z "$pid" || "$pid" -le 0 || -z "$api_sock" ]]; then + return 1 + fi + if [[ ! -r "/proc/$pid/cmdline" ]]; then + return 1 + fi + cmdline="$(cat "/proc/$pid/cmdline" 2>/dev/null | tr '\0' ' ' || true)" + [[ "$cmdline" == *firecracker* && "$cmdline" == *"$api_sock"* ]] +} wait_for_ssh() { local guest_ip="$1" @@ -62,6 +78,9 @@ wait_for_vm_ready() { if [[ "$VM_STATE" == "error" || -n "$LAST_ERROR" ]]; then return 2 fi + if [[ -n "$API_SOCK" && "${PID:-0}" -gt 0 ]] && ! firecracker_running "$PID" "$API_SOCK"; then + return 3 + fi if [[ "$VM_STATE" == "running" && -n "$GUEST_IP" && -n "$TAP" && -n "$VM_DIR" && -n "$API_SOCK" && "${PID:-0}" -gt 0 ]]; then if [[ -S "$API_SOCK" ]] && ip link show "$TAP" >/dev/null 2>&1; then return 0 @@ -76,8 +95,16 @@ wait_for_vm_ready() { dump_diagnostics() { log "diagnostics for $VM_NAME" ./banger vm show "$VM_NAME" || true + if [[ "${PID:-0}" -gt 0 ]]; then + log "process state for pid $PID" + ps -fp "$PID" || true + fi log "recent firecracker log" ./banger vm logs "$VM_NAME" 2>/dev/null | tail -n 200 || true + if [[ -f "$DAEMON_LOG" ]]; then + log "recent daemon log" + tail -n 200 "$DAEMON_LOG" || true + fi if [[ -n "${TAP:-}" ]]; then log "tap state for $TAP" ip link show "$TAP" || true @@ -124,6 +151,12 @@ PID="0" VM_STATE="" LAST_ERROR="" +delete_vm() { + if [[ -n "${VM_NAME:-}" ]]; then + ./banger vm delete "$VM_NAME" + fi +} + cleanup() { if [[ -n "${VM_NAME:-}" ]]; then ./banger vm delete "$VM_NAME" >/dev/null 2>&1 || true @@ -142,8 +175,15 @@ fi BOOT_DEADLINE=$((SECONDS + BOOT_TIMEOUT_SECS)) log "waiting for VM runtime readiness" -if ! wait_for_vm_ready "$BOOT_DEADLINE"; then - log "vm did not become ready before timeout" +if wait_for_vm_ready "$BOOT_DEADLINE"; then + : +else + status=$? + case "$status" in + 2) log "vm entered an error state before becoming ready" ;; + 3) log "firecracker exited before the guest became ready" ;; + *) log "vm did not become ready before timeout" ;; + esac dump_diagnostics exit 1 fi @@ -176,7 +216,11 @@ if (( NAT_ENABLED )); then fi log "cleaning up VM" -cleanup +if ! delete_vm; then + log "vm delete failed for $VM_NAME" + dump_diagnostics + exit 1 +fi log "asserting cleanup success" if ./banger vm show "$VM_NAME" >/dev/null 2>&1; then