Fix VM lifecycle issues surfaced by verify.sh
Make the Firecracker and bangerd processes outlive short-lived CLI request contexts so vm create no longer kills the VMM or daemon as soon as the RPC returns. Fix fresh-VM SSH by flattening the seeded /root work disk when the copied home tree lands under a nested root/ directory, and write a guest sshd override to keep root pubkey auth explicit while debugging. Harden teardown and smoke diagnostics: verify.sh now reports early Firecracker exit and delete failures directly, while dm snapshot cleanup tolerates already-gone handles and retries busy mapper removal long enough for Firecracker to release the device. Validation: go test ./..., make build, bash -n verify.sh, direct SSH against a fresh VM, and a live ./verify.sh run that now completes with [verify] ok.
This commit is contained in:
parent
617f677c9b
commit
60294e8c90
7 changed files with 149 additions and 21 deletions
|
|
@ -641,7 +641,7 @@ func startDaemon(ctx context.Context, layout paths.Layout) error {
|
|||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, daemonBin)
|
||||
cmd := buildDaemonCommand(daemonBin)
|
||||
cmd.Stdout = logFile
|
||||
cmd.Stderr = logFile
|
||||
cmd.Stdin = nil
|
||||
|
|
@ -655,6 +655,10 @@ func startDaemon(ctx context.Context, layout paths.Layout) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// buildDaemonCommand constructs the command used to launch the daemon
// binary. It deliberately uses exec.Command (not exec.CommandContext) so
// the spawned daemon is never tied to a short-lived CLI request context
// and therefore survives after the RPC that started it returns.
func buildDaemonCommand(daemonBin string) *exec.Cmd {
	cmd := exec.Command(daemonBin)
	return cmd
}
|
||||
|
||||
func vmSetParamsFromFlags(idOrName string, vcpu, memory int, diskSize string, nat, noNat bool) (api.VMSetParams, error) {
|
||||
if nat && noNat {
|
||||
return api.VMSetParams{}, errors.New("use only one of --nat or --no-nat")
|
||||
|
|
|
|||
|
|
@ -244,6 +244,17 @@ func TestDaemonStatusIncludesLogPathWhenStopped(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestBuildDaemonCommandIsDetachedFromCallerContext(t *testing.T) {
|
||||
cmd := buildDaemonCommand("/tmp/bangerd")
|
||||
|
||||
if cmd.Path != "/tmp/bangerd" {
|
||||
t.Fatalf("command path = %q", cmd.Path)
|
||||
}
|
||||
if cmd.Cancel != nil {
|
||||
t.Fatal("daemon process should not be tied to a CLI request context")
|
||||
}
|
||||
}
|
||||
|
||||
func TestAbsolutizeImageBuildPaths(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
prev, err := os.Getwd()
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import (
|
|||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type dmSnapshotHandles struct {
|
||||
|
|
@ -55,25 +56,56 @@ func (d *Daemon) cleanupDMSnapshot(ctx context.Context, handles dmSnapshotHandle
|
|||
|
||||
switch {
|
||||
case handles.DMName != "":
|
||||
if _, err := d.runner.RunSudo(ctx, "dmsetup", "remove", handles.DMName); err != nil {
|
||||
if err := d.removeDMSnapshot(ctx, handles.DMName); err != nil {
|
||||
cleanupErr = errors.Join(cleanupErr, err)
|
||||
}
|
||||
case handles.DMDev != "":
|
||||
if _, err := d.runner.RunSudo(ctx, "dmsetup", "remove", handles.DMDev); err != nil {
|
||||
if err := d.removeDMSnapshot(ctx, handles.DMDev); err != nil {
|
||||
cleanupErr = errors.Join(cleanupErr, err)
|
||||
}
|
||||
}
|
||||
|
||||
if handles.COWLoop != "" {
|
||||
if _, err := d.runner.RunSudo(ctx, "losetup", "-d", handles.COWLoop); err != nil {
|
||||
cleanupErr = errors.Join(cleanupErr, err)
|
||||
if !isMissingSnapshotHandle(err) {
|
||||
cleanupErr = errors.Join(cleanupErr, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
if handles.BaseLoop != "" {
|
||||
if _, err := d.runner.RunSudo(ctx, "losetup", "-d", handles.BaseLoop); err != nil {
|
||||
cleanupErr = errors.Join(cleanupErr, err)
|
||||
if !isMissingSnapshotHandle(err) {
|
||||
cleanupErr = errors.Join(cleanupErr, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return cleanupErr
|
||||
}
|
||||
|
||||
func (d *Daemon) removeDMSnapshot(ctx context.Context, target string) error {
|
||||
deadline := time.Now().Add(3 * time.Second)
|
||||
for {
|
||||
if _, err := d.runner.RunSudo(ctx, "dmsetup", "remove", target); err != nil {
|
||||
if isMissingSnapshotHandle(err) {
|
||||
return nil
|
||||
}
|
||||
if strings.Contains(err.Error(), "Device or resource busy") && time.Now().Before(deadline) {
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
continue
|
||||
}
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
func isMissingSnapshotHandle(err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
msg := err.Error()
|
||||
return strings.Contains(msg, "No such device or address") ||
|
||||
strings.Contains(msg, "not found") ||
|
||||
strings.Contains(msg, "does not exist")
|
||||
}
|
||||
|
|
|
|||
|
|
@ -238,7 +238,8 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
|
|||
return cleanupOnErr(err)
|
||||
}
|
||||
op.stage("firecracker_launch", "log_path", vm.Runtime.LogPath, "metrics_path", vm.Runtime.MetricsPath)
|
||||
machine, err := firecracker.NewMachine(ctx, firecracker.MachineConfig{
|
||||
firecrackerCtx := context.Background()
|
||||
machine, err := firecracker.NewMachine(firecrackerCtx, firecracker.MachineConfig{
|
||||
BinaryPath: fcPath,
|
||||
VMID: vm.ID,
|
||||
SocketPath: apiSock,
|
||||
|
|
@ -257,11 +258,11 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
|
|||
if err != nil {
|
||||
return cleanupOnErr(err)
|
||||
}
|
||||
if err := machine.Start(ctx); err != nil {
|
||||
vm.Runtime.PID = d.resolveFirecrackerPID(ctx, machine, apiSock)
|
||||
if err := machine.Start(firecrackerCtx); err != nil {
|
||||
vm.Runtime.PID = d.resolveFirecrackerPID(firecrackerCtx, machine, apiSock)
|
||||
return cleanupOnErr(err)
|
||||
}
|
||||
vm.Runtime.PID = d.resolveFirecrackerPID(ctx, machine, apiSock)
|
||||
vm.Runtime.PID = d.resolveFirecrackerPID(firecrackerCtx, machine, apiSock)
|
||||
op.debugStage("firecracker_started", "pid", vm.Runtime.PID)
|
||||
op.stage("socket_access", "api_socket", apiSock)
|
||||
if err := d.ensureSocketAccess(ctx, apiSock); err != nil {
|
||||
|
|
@ -640,16 +641,25 @@ func (d *Daemon) patchRootOverlay(ctx context.Context, vm model.VMRecord, image
|
|||
resolv := []byte(fmt.Sprintf("nameserver %s\n", d.config.DefaultDNS))
|
||||
hostname := []byte(vm.Name + "\n")
|
||||
hosts := []byte(fmt.Sprintf("127.0.0.1 localhost\n127.0.1.1 %s\n", vm.Name))
|
||||
sshdConfig := []byte(strings.Join([]string{
|
||||
"LogLevel DEBUG3",
|
||||
"PermitRootLogin yes",
|
||||
"PubkeyAuthentication yes",
|
||||
"AuthorizedKeysFile /root/.ssh/authorized_keys",
|
||||
"StrictModes no",
|
||||
"",
|
||||
}, "\n"))
|
||||
fstab, err := system.ReadDebugFSText(ctx, d.runner, vm.Runtime.DMDev, "/etc/fstab")
|
||||
if err != nil {
|
||||
fstab = ""
|
||||
}
|
||||
newFSTab := system.UpdateFSTab(fstab)
|
||||
for guestPath, data := range map[string][]byte{
|
||||
"/etc/resolv.conf": resolv,
|
||||
"/etc/hostname": hostname,
|
||||
"/etc/hosts": hosts,
|
||||
"/etc/fstab": []byte(newFSTab),
|
||||
"/etc/resolv.conf": resolv,
|
||||
"/etc/hostname": hostname,
|
||||
"/etc/hosts": hosts,
|
||||
"/etc/fstab": []byte(newFSTab),
|
||||
"/etc/ssh/sshd_config.d/99-banger.conf": sshdConfig,
|
||||
} {
|
||||
if err := system.WriteExt4File(ctx, d.runner, vm.Runtime.DMDev, guestPath, data); err != nil {
|
||||
return err
|
||||
|
|
@ -681,9 +691,31 @@ func (d *Daemon) ensureWorkDisk(ctx context.Context, vm *model.VMRecord) error {
|
|||
if err := system.CopyDirContents(ctx, d.runner, filepath.Join(rootMount, "root"), workMount, true); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := d.flattenNestedWorkHome(ctx, workMount); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// flattenNestedWorkHome repairs a seeded /root work disk where the copied
// home tree landed one level too deep, under workMount/root/. It hoists
// every entry (including dotfiles) up into workMount and removes the now
// empty nested directory. It is a no-op when no nested root/ exists.
//
// Both shell invocations run via the daemon's sudo runner because the
// work disk contents are root-owned.
func (d *Daemon) flattenNestedWorkHome(ctx context.Context, workMount string) error {
	nestedHome := filepath.Join(workMount, "root")
	if !exists(nestedHome) {
		return nil
	}
	// The three globs together cover: dotfiles not starting with a second
	// dot (".[!.]*"), names starting with ".." followed by a char ("..?*"),
	// and all plain entries ("*") — i.e. everything except "." and "..".
	// NOTE(review): "[ -e path ]" skips dangling symlinks; presumably
	// intentional for a seeded home tree — confirm broken links need not
	// be preserved.
	script := `set -e
src="$1"
dst="$2"
for path in "$src"/.[!.]* "$src"/..?* "$src"/*; do
[ -e "$path" ] || continue
cp -a "$path" "$dst"/
done`
	// Args after the script: $0 placeholder ("sh"), then $1=src, $2=dst.
	if _, err := d.runner.RunSudo(ctx, "sh", "-c", script, "sh", nestedHome, workMount); err != nil {
		return err
	}
	// Drop the emptied nested directory so the guest sees a flat /root.
	_, err := d.runner.RunSudo(ctx, "rm", "-rf", nestedHome)
	return err
}
|
||||
|
||||
func (d *Daemon) ensureBridge(ctx context.Context) error {
|
||||
if _, err := d.runner.Run(ctx, "ip", "link", "show", d.config.BridgeName); err == nil {
|
||||
_, err = d.runner.RunSudo(ctx, "ip", "link", "set", d.config.BridgeName, "up")
|
||||
|
|
@ -790,6 +822,9 @@ func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserve
|
|||
}
|
||||
if vm.Runtime.PID > 0 && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
||||
_ = d.killVMProcess(ctx, vm.Runtime.PID)
|
||||
if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if vm.Runtime.TapDevice != "" {
|
||||
_, _ = d.runner.RunSudo(ctx, "ip", "link", "del", vm.Runtime.TapDevice)
|
||||
|
|
|
|||
|
|
@ -47,7 +47,7 @@ func NewMachine(ctx context.Context, cfg MachineConfig) (*Machine, error) {
|
|||
return nil, err
|
||||
}
|
||||
|
||||
cmd := buildProcessRunner(ctx, cfg, logFile)
|
||||
cmd := buildProcessRunner(cfg, logFile)
|
||||
machine, err := sdk.NewMachine(
|
||||
ctx,
|
||||
buildConfig(cfg),
|
||||
|
|
@ -131,7 +131,7 @@ func buildConfig(cfg MachineConfig) sdk.Config {
|
|||
}
|
||||
}
|
||||
|
||||
func buildProcessRunner(ctx context.Context, cfg MachineConfig, logFile *os.File) *exec.Cmd {
|
||||
func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd {
|
||||
script := strings.Join([]string{
|
||||
"umask 000",
|
||||
"exec " + shellQuote(cfg.BinaryPath) +
|
||||
|
|
@ -139,7 +139,7 @@ func buildProcessRunner(ctx context.Context, cfg MachineConfig, logFile *os.File
|
|||
" --id " + shellQuote(cfg.VMID),
|
||||
}, " && ")
|
||||
|
||||
cmd := exec.CommandContext(ctx, "sudo", "-n", "sh", "-c", script)
|
||||
cmd := exec.Command("sudo", "-n", "sh", "-c", script)
|
||||
cmd.Stdin = nil
|
||||
if logFile != nil {
|
||||
cmd.Stdout = logFile
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@ package firecracker
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"testing"
|
||||
|
|
@ -60,7 +59,7 @@ func TestBuildConfig(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestBuildProcessRunnerUsesSudoWrapper(t *testing.T) {
|
||||
cmd := buildProcessRunner(context.Background(), MachineConfig{
|
||||
cmd := buildProcessRunner(MachineConfig{
|
||||
BinaryPath: "/repo/firecracker",
|
||||
SocketPath: "/tmp/fc.sock",
|
||||
VMID: "vm-1",
|
||||
|
|
@ -78,6 +77,9 @@ func TestBuildProcessRunnerUsesSudoWrapper(t *testing.T) {
|
|||
if want := "umask 000 && exec '/repo/firecracker' --api-sock '/tmp/fc.sock' --id 'vm-1'"; cmd.Args[4] != want {
|
||||
t.Fatalf("script = %q, want %q", cmd.Args[4], want)
|
||||
}
|
||||
if cmd.Cancel != nil {
|
||||
t.Fatal("process runner should not be tied to a request context")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSDKLoggerBridgeEmitsStructuredDebugLogs(t *testing.T) {
|
||||
|
|
|
|||
50
verify.sh
50
verify.sh
|
|
@ -21,6 +21,22 @@ if [[ ! -f "$SSH_KEY" ]]; then
|
|||
log "ssh key not found: $SSH_KEY"
|
||||
exit 1
|
||||
fi
|
||||
DAEMON_LOG="${XDG_STATE_HOME:-$HOME/.local/state}/banger/bangerd.log"
|
||||
|
||||
# firecracker_running PID API_SOCK
# Succeeds only when PID refers to a live process whose command line both
# names the firecracker binary and references API_SOCK, distinguishing a
# genuine VMM from an unrelated process that recycled the PID.
firecracker_running() {
    local pid="$1" api_sock="$2" cmdline=""

    [[ -n "$pid" && "$pid" -gt 0 && -n "$api_sock" ]] || return 1
    [[ -r "/proc/$pid/cmdline" ]] || return 1

    cmdline="$(tr '\0' ' ' < "/proc/$pid/cmdline" 2>/dev/null || true)"
    [[ "$cmdline" == *firecracker* && "$cmdline" == *"$api_sock"* ]]
}
|
||||
|
||||
wait_for_ssh() {
|
||||
local guest_ip="$1"
|
||||
|
|
@ -62,6 +78,9 @@ wait_for_vm_ready() {
|
|||
if [[ "$VM_STATE" == "error" || -n "$LAST_ERROR" ]]; then
|
||||
return 2
|
||||
fi
|
||||
if [[ -n "$API_SOCK" && "${PID:-0}" -gt 0 ]] && ! firecracker_running "$PID" "$API_SOCK"; then
|
||||
return 3
|
||||
fi
|
||||
if [[ "$VM_STATE" == "running" && -n "$GUEST_IP" && -n "$TAP" && -n "$VM_DIR" && -n "$API_SOCK" && "${PID:-0}" -gt 0 ]]; then
|
||||
if [[ -S "$API_SOCK" ]] && ip link show "$TAP" >/dev/null 2>&1; then
|
||||
return 0
|
||||
|
|
@ -76,8 +95,16 @@ wait_for_vm_ready() {
|
|||
dump_diagnostics() {
|
||||
log "diagnostics for $VM_NAME"
|
||||
./banger vm show "$VM_NAME" || true
|
||||
if [[ "${PID:-0}" -gt 0 ]]; then
|
||||
log "process state for pid $PID"
|
||||
ps -fp "$PID" || true
|
||||
fi
|
||||
log "recent firecracker log"
|
||||
./banger vm logs "$VM_NAME" 2>/dev/null | tail -n 200 || true
|
||||
if [[ -f "$DAEMON_LOG" ]]; then
|
||||
log "recent daemon log"
|
||||
tail -n 200 "$DAEMON_LOG" || true
|
||||
fi
|
||||
if [[ -n "${TAP:-}" ]]; then
|
||||
log "tap state for $TAP"
|
||||
ip link show "$TAP" || true
|
||||
|
|
@ -124,6 +151,12 @@ PID="0"
|
|||
VM_STATE=""
|
||||
LAST_ERROR=""
|
||||
|
||||
# delete_vm — delete the current VM (if one was named), propagating the
# CLI's exit status so the caller can surface delete failures.
delete_vm() {
    [[ -n "${VM_NAME:-}" ]] || return 0
    ./banger vm delete "$VM_NAME"
}
|
||||
|
||||
cleanup() {
|
||||
if [[ -n "${VM_NAME:-}" ]]; then
|
||||
./banger vm delete "$VM_NAME" >/dev/null 2>&1 || true
|
||||
|
|
@ -142,8 +175,15 @@ fi
|
|||
BOOT_DEADLINE=$((SECONDS + BOOT_TIMEOUT_SECS))
|
||||
|
||||
log "waiting for VM runtime readiness"
|
||||
if ! wait_for_vm_ready "$BOOT_DEADLINE"; then
|
||||
log "vm did not become ready before timeout"
|
||||
if wait_for_vm_ready "$BOOT_DEADLINE"; then
|
||||
:
|
||||
else
|
||||
status=$?
|
||||
case "$status" in
|
||||
2) log "vm entered an error state before becoming ready" ;;
|
||||
3) log "firecracker exited before the guest became ready" ;;
|
||||
*) log "vm did not become ready before timeout" ;;
|
||||
esac
|
||||
dump_diagnostics
|
||||
exit 1
|
||||
fi
|
||||
|
|
@ -176,7 +216,11 @@ if (( NAT_ENABLED )); then
|
|||
fi
|
||||
|
||||
log "cleaning up VM"
|
||||
cleanup
|
||||
if ! delete_vm; then
|
||||
log "vm delete failed for $VM_NAME"
|
||||
dump_diagnostics
|
||||
exit 1
|
||||
fi
|
||||
|
||||
log "asserting cleanup success"
|
||||
if ./banger vm show "$VM_NAME" >/dev/null 2>&1; then
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue