diff --git a/internal/daemon/vm_lifecycle.go b/internal/daemon/vm_lifecycle.go index 6f4fcf5..e759bc6 100644 --- a/internal/daemon/vm_lifecycle.go +++ b/internal/daemon/vm_lifecycle.go @@ -3,12 +3,15 @@ package daemon import ( "context" "errors" + "io" + "net" "os" "path/filepath" "strings" "time" "banger/internal/api" + "banger/internal/guest" "banger/internal/model" "banger/internal/system" ) @@ -130,8 +133,35 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v } pid := s.vmHandles(vm.ID).PID op.stage("graceful_shutdown") - if err := s.net.sendCtrlAltDel(ctx, vm.Runtime.APISockPath); err != nil { - return model.VMRecord{}, err + // Reach into the guest over SSH to force a sync + queue a poweroff + // before falling back on FC's SendCtrlAltDel. The sync is what + // keeps stop() from losing data: every dirty page the guest hasn't + // flushed through virtio-blk to the work disk is written out + // before this RPC returns. Without it, files freshly created via + // `vm workspace prepare` can disappear across stop+start, because + // the 10-second wait_for_exit window expires (FC doesn't exit on + // SendCtrlAltDel — Debian routes ctrl-alt-del.target → reboot.target, + // not poweroff) and the fallback SIGKILL drops everything still + // in FC's userspace I/O path. + // + // `systemctl --no-block poweroff` is queued for the same reason + // SendCtrlAltDel was here originally — it's how stop() asks the + // guest to halt. That request is best-effort; FC may or may not + // exit before the SIGKILL fallback fires. Either way, sync + // already ran, so the on-host root.ext4 is consistent regardless. + // + // SendCtrlAltDel survives as a fallback for guests where SSH + // itself is unreachable (broken sshd, network down, drifted host + // key); it doesn't fix the data-loss path, but it's the existing + // last-resort signal and is at least no worse than today. + if err := s.requestGuestPoweroff(ctx, vm); err != nil { + if s.logger != nil { + s.logger.Warn("guest ssh poweroff failed; falling back to ctrl+alt+del", + append(vmLogAttrs(vm), "error", err.Error())...) + } + if fallbackErr := s.net.sendCtrlAltDel(ctx, vm.Runtime.APISockPath); fallbackErr != nil { + return model.VMRecord{}, fallbackErr + } } op.stage("wait_for_exit", "pid", pid) if err := s.net.waitForExit(ctx, pid, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil { @@ -155,6 +185,39 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v return vm, nil } +// requestGuestPoweroff dials the guest over SSH and runs a sync + +// queues a poweroff job. The sync is the load-bearing piece — see the +// comment in stopVMLocked. Returns the dial / SSH error if the guest +// is unreachable; the caller treats that as a fallback signal. +// +// Bounded by a hard 5-second SSH-dial timeout so a half-broken guest +// doesn't extend the overall stop window past the existing +// gracefulShutdownWait. If the dial doesn't succeed in that window we +// surface an error and let the caller take the SendCtrlAltDel path. +func (s *VMService) requestGuestPoweroff(ctx context.Context, vm model.VMRecord) error { + guestIP := strings.TrimSpace(vm.Runtime.GuestIP) + if guestIP == "" { + return errors.New("guest IP unknown") + } + dialCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + address := net.JoinHostPort(guestIP, "22") + client, err := guest.Dial(dialCtx, address, s.config.SSHKeyPath, s.layout.KnownHostsPath) + if err != nil { + return err + } + defer client.Close() + // `sync` runs synchronously and blocks RunScript until every dirty + // page hits virtio-blk → root.ext4. That's the persistence + // guarantee. The `systemctl --no-block poweroff` queues a job and + // returns; whether poweroff.target completes before the SIGKILL + // fallback fires is incidental — by then sync has already done + // its work. The `|| /sbin/poweroff -f` is the last-ditch fallback + // when systemd itself is wedged. + const script = "sync; systemctl --no-block poweroff || /sbin/poweroff -f &" + return client.RunScript(ctx, script, io.Discard) +} + func (s *VMService) KillVM(ctx context.Context, params api.VMKillParams) (model.VMRecord, error) { return s.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) { return s.killVMLocked(ctx, vm, params.Signal) diff --git a/scripts/smoke.sh b/scripts/smoke.sh index 53022b8..d52dee4 100644 --- a/scripts/smoke.sh +++ b/scripts/smoke.sh @@ -103,7 +103,7 @@ cleanup() { set +e for vm in \ smoke-lifecycle smoke-set smoke-restart smoke-kill smoke-ports smoke-fc \ - smoke-basecommit smoke-nat smoke-nocnat; do + smoke-basecommit smoke-exec smoke-wsrestart smoke-nat smoke-nocnat; do "$BANGER" vm delete "$vm" >/dev/null 2>&1 || true done cleanup_export_vm @@ -379,6 +379,120 @@ grep -q 'smoke-committed.txt' "$base_patch" \ "$BANGER" vm delete smoke-basecommit >/dev/null || die 'export base: delete failed' +# --- workspace + stop/start lifecycle --------------------------------- +log 'workspace + stop/start: prepare → stop → start must yield a usable rootfs and preserve workspace contents' +"$BANGER" vm create --name smoke-wsrestart >/dev/null \ + || die 'workspace stop/start: create failed' +"$BANGER" vm workspace prepare smoke-wsrestart "$repodir" >/dev/null \ + || die 'workspace stop/start: prepare failed' + +# Sanity: marker is present before the stop/start cycle. +pre_out="$("$BANGER" vm ssh smoke-wsrestart -- cat /root/repo/smoke-file.txt)" \ + || die 'workspace stop/start: pre-cycle ssh read failed' +grep -q 'smoke-workspace-marker' <<<"$pre_out" \ + || die "workspace stop/start: marker missing pre-cycle: $pre_out" + +"$BANGER" vm stop smoke-wsrestart >/dev/null \ + || die 'workspace stop/start: stop failed' +"$BANGER" vm start smoke-wsrestart >/dev/null \ + || die 'workspace stop/start: start after stop failed (rootfs corrupt?)' +wait_for_ssh smoke-wsrestart \ + || die 'workspace stop/start: ssh did not come up after restart' + +post_out="$("$BANGER" vm ssh smoke-wsrestart -- cat /root/repo/smoke-file.txt)" \ + || die 'workspace stop/start: post-cycle ssh read failed' +grep -q 'smoke-workspace-marker' <<<"$post_out" \ + || die "workspace stop/start: marker lost across stop/start: $post_out" + +"$BANGER" vm delete smoke-wsrestart >/dev/null \ + || die 'workspace stop/start: delete failed' + +# --- vm exec (workspace-aware, dirty detection, auto-prepare) --------- +log 'vm exec: cd into prepared workspace, exit-code propagation, stale-warn, --auto-prepare resync' +"$BANGER" vm create --name smoke-exec >/dev/null || die 'vm exec: create failed' +"$BANGER" vm workspace prepare smoke-exec "$repodir" >/dev/null \ + || die 'vm exec: workspace prepare failed' + +# WORKSPACE column populated in vm show after prepare. +show_out="$("$BANGER" vm show smoke-exec)" || die 'vm exec: vm show after prepare failed' +grep -q '"guest_path": "/root/repo"' <<<"$show_out" \ + || die "vm exec: workspace.guest_path not persisted on VM record: $show_out" + +# Basic happy path: cd happens, file is read from the workspace. +exec_cat="$("$BANGER" vm exec smoke-exec -- cat smoke-file.txt)" \ + || die "vm exec: cat smoke-file.txt failed" +grep -q 'smoke-workspace-marker' <<<"$exec_cat" \ + || die "vm exec: stdout missing workspace marker: $exec_cat" + +# pwd confirms the auto-cd into the prepared guest path. +exec_pwd="$("$BANGER" vm exec smoke-exec -- pwd | tr -d '[:space:]')" \ + || die 'vm exec: pwd failed' +[[ "$exec_pwd" == "/root/repo" ]] \ + || die "vm exec: pwd got '$exec_pwd', want '/root/repo' (auto-cd didn't happen)" + +# Exit-code propagation: 17 must come back as 17, verbatim. +set +e +"$BANGER" vm exec smoke-exec -- sh -c 'exit 17' >/dev/null 2>&1 +rc=$? +set -e +[[ "$rc" -eq 17 ]] || die "vm exec: exit-code propagation got rc=$rc, want 17" + +# Dirty detection: advance host HEAD, run `vm exec` without --auto-prepare, +# expect a stale-workspace warning on stderr and the new file NOT present in +# the guest (workspace was not re-synced). +( + cd "$repodir" + echo 'post-prepare-marker' > smoke-exec-new.txt + git add smoke-exec-new.txt + git commit -q -m 'add smoke-exec-new.txt after prepare' +) +stale_stderr="$runtime_dir/smoke-exec-stale.err" +set +e +"$BANGER" vm exec smoke-exec -- ls smoke-exec-new.txt >/dev/null 2>"$stale_stderr" +ls_rc=$? +set -e +[[ "$ls_rc" -ne 0 ]] \ + || die 'vm exec: stale workspace unexpectedly already had the new file (dirty path didn'"'"'t take effect)' +grep -q 'workspace stale' "$stale_stderr" \ + || die "vm exec: stale-workspace warning missing on stderr; got: $(cat "$stale_stderr")" +grep -q -- '--auto-prepare' "$stale_stderr" \ + || die "vm exec: stale warning didn't mention --auto-prepare hint; got: $(cat "$stale_stderr")" + +# --auto-prepare: re-syncs workspace, then runs the command. New file appears. +auto_out="$("$BANGER" vm exec smoke-exec --auto-prepare -- cat smoke-exec-new.txt)" \ + || die 'vm exec: --auto-prepare run failed' +grep -q 'post-prepare-marker' <<<"$auto_out" \ + || die "vm exec: --auto-prepare didn't re-sync new file; got: $auto_out" + +# After auto-prepare, the warning must NOT reappear on the next exec — +# stored HEAD should now match the host. +clean_stderr="$runtime_dir/smoke-exec-clean.err" +"$BANGER" vm exec smoke-exec -- true 2>"$clean_stderr" \ + || die 'vm exec: post-auto-prepare exec failed' +if grep -q 'workspace stale' "$clean_stderr"; then + die "vm exec: stale warning persisted after --auto-prepare; got: $(cat "$clean_stderr")" +fi + +# Reset repo state so later sections see the original tree. +( + cd "$repodir" + git reset --hard HEAD~1 -q +) + +# Refusal when VM is not running: exec on a stopped VM must error out +# with a clear "not running" message. Done last so we can delete from +# the stopped state without needing a restart. +"$BANGER" vm stop smoke-exec >/dev/null || die 'vm exec: stop for not-running test failed' +set +e +stopped_err="$("$BANGER" vm exec smoke-exec -- true 2>&1)" +rc=$? +set -e +[[ "$rc" -ne 0 ]] || die 'vm exec: exec on stopped VM unexpectedly succeeded' +grep -q 'not running' <<<"$stopped_err" \ + || die "vm exec: stopped-VM error missing 'not running' phrase: $stopped_err" + +"$BANGER" vm delete smoke-exec >/dev/null || die 'vm exec: delete failed' + # --- ssh-config install / uninstall (HOME-isolated) ------------------- log 'ssh-config --install / --uninstall: idempotent, survives round-trip' fake_home="$scratch_root/fake-home"