banger/internal/daemon/vm_lifecycle.go
Thales Maciel c9358ab390
daemon: sync guest over ssh before stop to preserve workspace writes
VM stop has been quietly losing data freshly written via
`vm workspace prepare`: stop+start of a workspace-prepared VM would
come back with /root/repo wiped on the work disk.

Root cause is Firecracker + Debian's systemd defaults. FC's
SendCtrlAltDel (the only "graceful shutdown" action FC exposes) just
delivers the keystroke; what the guest does with it is up to the
guest. Debian routes ctrl-alt-del.target -> reboot.target, so the
guest reboots, FC stays alive, the daemon's 10s wait_for_exit window
expires, and the SIGKILL fallback drops anything still in FC's
userspace I/O path. For an idle VM that's invisible. For one that
just took hundreds of small writes through a workspace prepare, it's
data loss.

Fix is to dial the guest over SSH inside StopVM and run
`sync; systemctl --no-block poweroff || /sbin/poweroff -f &` before
the existing SendCtrlAltDel path. The synchronous `sync` is the
load-bearing piece: it blocks until every dirty page hits virtio-blk
and lands in the on-host root.ext4. Whether poweroff completes
before SIGKILL fires is incidental; sync has already run. If SSH is
unreachable, stop falls back to the old SendCtrlAltDel behaviour so
a broken-network guest can't make stop hang.

Bounded by a 5s SSH-dial timeout so a half-broken guest can't extend
the overall stop window past gracefulShutdownWait.

Also adds two smoke scenarios:
- `workspace + stop/start`: prepare -> stop -> start -> assert the
  marker survives. This is the regression scenario that caught the bug.
- `vm exec`: end-to-end coverage for d59425a — auto-cd into the
  prepared workspace, exit-code propagation, dirty-host warning,
  --auto-prepare resync, refusal on stopped VM.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 15:41:32 -03:00

359 lines
12 KiB
Go

package daemon

import (
	"context"
	"errors"
	"io"
	"net"
	"os"
	"path/filepath"
	"strings"
	"time"

	"banger/internal/api"
	"banger/internal/guest"
	"banger/internal/model"
	"banger/internal/system"
)
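
// StartVM resolves idOrName, takes the per-VM lock, and boots the VM if
// it is not already running. Starting an already-running VM is a no-op
// that returns the current record.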
func (s *VMService) StartVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		image, err := s.store.GetImageByID(ctx, vm.ImageID)
		if err != nil {
			return model.VMRecord{}, err
		}
		if s.vmAlive(vm) {
			if s.logger != nil {
				s.logger.Info("vm already running", vmLogAttrs(vm)...)
			}
			return vm, nil
		}
		return s.startVMLocked(ctx, vm, image)
	})
}
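
// startVMLocked runs the start-step pipeline for a VM whose lock is
// already held. On step failure the driver has already rolled back the
// succeeded steps in reverse; this function then persists the ERROR
// state so the failure is visible via `vm show`.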
func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image model.Image) (_ model.VMRecord, err error) {
	op := s.beginOperation(ctx, "vm.start", append(vmLogAttrs(vm), imageLogAttrs(image)...)...)
	defer func() {
		if err != nil {
			err = annotateLogPath(err, vm.Runtime.LogPath)
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	// Derive per-VM paths/names up front so every step sees the same
	// values. Shortening vm.ID mirrors how the pre-refactor inline
	// code did it.
	shortID := system.ShortID(vm.ID)
	apiSock := filepath.Join(s.layout.RuntimeDir, "fc-"+shortID+".sock")
	dmName := "fc-rootfs-" + shortID
	tapName := "tap-fc-" + shortID
	if strings.TrimSpace(vm.Runtime.VSockPath) == "" {
		vm.Runtime.VSockPath = defaultVSockPath(s.layout.RuntimeDir, vm.ID)
	}
	if vm.Runtime.VSockCID == 0 {
		vm.Runtime.VSockCID, err = defaultVSockCID(vm.Runtime.GuestIP)
		if err != nil {
			return model.VMRecord{}, err
		}
	}
	live := model.VMHandles{}
	sc := &startContext{
		vm:      &vm,
		image:   image,
		live:    &live,
		apiSock: apiSock,
		dmName:  dmName,
		tapName: tapName,
	}
	if runErr := s.runStartSteps(ctx, op, sc, s.buildStartSteps(op, sc)); runErr != nil {
		// The step driver already ran rollback in reverse for every
		// succeeded step. All that's left is to persist the ERROR
		// state so operators see the failure via `vm show`. Use a
		// fresh context in case the request ctx is cancelled — DB
		// writes past this point are recovery, not user-driven.
		//
		// The store check is for tests that construct a bare Daemon
		// without a DB; production always has s.store non-nil.
		vm.State = model.VMStateError
		vm.Runtime.State = model.VMStateError
		vm.Runtime.LastError = runErr.Error()
		clearRuntimeTeardownState(&vm)
		s.clearVMHandles(vm)
		if s.store != nil {
			// We're in the recovery path: the start has already
			// failed, and the user will see runErr. A persist
			// failure here only affects what 'banger vm show'
			// reads on the next call, so we keep returning runErr
			// — but a silent swallow leaves operators unable to
			// debug "why does the record still say running?". Log
			// at warn instead.
			if persistErr := s.store.UpsertVM(context.Background(), vm); persistErr != nil && s.logger != nil {
				s.logger.Warn("persist vm error state failed", append(vmLogAttrs(vm), "error", persistErr.Error())...)
			}
		}
		return model.VMRecord{}, runErr
	}
	return vm, nil
}
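
// StopVM resolves idOrName, takes the per-VM lock, and stops the VM:
// guest-side sync + poweroff over SSH first, SendCtrlAltDel as the
// fallback, then runtime cleanup once the wait-for-exit window closes.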
func (s *VMService) StopVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		return s.stopVMLocked(ctx, vm)
	})
}
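
// stopVMLocked stops a VM whose lock is already held. A VM that is not
// alive still gets its stale runtime cleaned up and its record marked
// stopped.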
func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
	vm = current
	op := s.beginOperation(ctx, "vm.stop", "vm_ref", vm.ID)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	if !s.vmAlive(vm) {
		op.stage("cleanup_stale_runtime")
		if err := s.cleanupRuntime(ctx, vm, true); err != nil {
			return model.VMRecord{}, err
		}
		vm.State = model.VMStateStopped
		vm.Runtime.State = model.VMStateStopped
		clearRuntimeTeardownState(&vm)
		s.clearVMHandles(vm)
		if err := s.store.UpsertVM(ctx, vm); err != nil {
			return model.VMRecord{}, err
		}
		return vm, nil
	}
	pid := s.vmHandles(vm.ID).PID
	op.stage("graceful_shutdown")
	// Reach into the guest over SSH to force a sync + queue a poweroff
	// before falling back on FC's SendCtrlAltDel. The sync is what
	// keeps stop() from losing data: every dirty page the guest hasn't
	// flushed through virtio-blk to the work disk is written out
	// before this RPC returns. Without it, files freshly created via
	// `vm workspace prepare` can disappear across stop+start, because
	// the 10-second wait_for_exit window expires (FC doesn't exit on
	// SendCtrlAltDel — Debian routes ctrl-alt-del.target → reboot.target,
	// not poweroff) and the fallback SIGKILL drops everything still
	// in FC's userspace I/O path.
	//
	// `systemctl --no-block poweroff` is queued for the same reason
	// SendCtrlAltDel was here originally — it's how stop() asks the
	// guest to halt. That request is best-effort; FC may or may not
	// exit before the SIGKILL fallback fires. Either way, sync
	// already ran, so the on-host root.ext4 is consistent regardless.
	//
	// SendCtrlAltDel survives as a fallback for guests where SSH
	// itself is unreachable (broken sshd, network down, drifted host
	// key); it doesn't fix the data-loss path, but it's the existing
	// last-resort signal and is at least no worse than today.
	if err := s.requestGuestPoweroff(ctx, vm); err != nil {
		if s.logger != nil {
			s.logger.Warn("guest ssh poweroff failed; falling back to ctrl+alt+del",
				append(vmLogAttrs(vm), "error", err.Error())...)
		}
		if fallbackErr := s.net.sendCtrlAltDel(ctx, vm.Runtime.APISockPath); fallbackErr != nil {
			return model.VMRecord{}, fallbackErr
		}
	}
	op.stage("wait_for_exit", "pid", pid)
	if err := s.net.waitForExit(ctx, pid, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil {
		if !errors.Is(err, errWaitForExitTimeout) {
			return model.VMRecord{}, err
		}
		op.stage("graceful_shutdown_timeout", "pid", pid)
	}
	op.stage("cleanup_runtime")
	if err := s.cleanupRuntime(ctx, vm, true); err != nil {
		return model.VMRecord{}, err
	}
	vm.State = model.VMStateStopped
	vm.Runtime.State = model.VMStateStopped
	clearRuntimeTeardownState(&vm)
	s.clearVMHandles(vm)
	system.TouchNow(&vm)
	if err := s.store.UpsertVM(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	return vm, nil
}

// requestGuestPoweroff dials the guest over SSH and runs a sync +
// queues a poweroff job. The sync is the load-bearing piece — see the
// comment in stopVMLocked. Returns the dial / SSH error if the guest
// is unreachable; the caller treats that as a fallback signal.
//
// Bounded by a hard 5-second SSH-dial timeout so a half-broken guest
// doesn't extend the overall stop window past the existing
// gracefulShutdownWait. If the dial doesn't succeed in that window we
// surface an error and let the caller take the SendCtrlAltDel path.
func (s *VMService) requestGuestPoweroff(ctx context.Context, vm model.VMRecord) error {
	guestIP := strings.TrimSpace(vm.Runtime.GuestIP)
	if guestIP == "" {
		return errors.New("guest IP unknown")
	}
	dialCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()
	address := net.JoinHostPort(guestIP, "22")
	client, err := guest.Dial(dialCtx, address, s.config.SSHKeyPath, s.layout.KnownHostsPath)
	if err != nil {
		return err
	}
	defer client.Close()
	// `sync` runs synchronously and blocks RunScript until every dirty
	// page hits virtio-blk → root.ext4. That's the persistence
	// guarantee. The `systemctl --no-block poweroff` queues a job and
	// returns; whether poweroff.target completes before the SIGKILL
	// fallback fires is incidental — by then sync has already done
	// its work. The `|| /sbin/poweroff -f` is the last-ditch fallback
	// when systemd itself is wedged.
	const script = "sync; systemctl --no-block poweroff || /sbin/poweroff -f &"
	return client.RunScript(ctx, script, io.Discard)
}
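
// KillVM resolves the VM reference, takes the per-VM lock, and sends the
// requested signal (TERM when none is given) instead of a graceful stop.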
func (s *VMService) KillVM(ctx context.Context, params api.VMKillParams) (model.VMRecord, error) {
	return s.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		return s.killVMLocked(ctx, vm, params.Signal)
	})
}
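
// killVMLocked signals a VM whose lock is already held, waits up to 30s
// for the process to exit, then cleans up the runtime and marks the
// record stopped. A VM that is not alive only gets the stale-runtime
// cleanup.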
func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, signalValue string) (vm model.VMRecord, err error) {
	vm = current
	op := s.beginOperation(ctx, "vm.kill", "vm_ref", vm.ID, "signal", signalValue)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	if !s.vmAlive(vm) {
		op.stage("cleanup_stale_runtime")
		if err := s.cleanupRuntime(ctx, vm, true); err != nil {
			return model.VMRecord{}, err
		}
		vm.State = model.VMStateStopped
		vm.Runtime.State = model.VMStateStopped
		clearRuntimeTeardownState(&vm)
		s.clearVMHandles(vm)
		if err := s.store.UpsertVM(ctx, vm); err != nil {
			return model.VMRecord{}, err
		}
		return vm, nil
	}
	signal := strings.TrimSpace(signalValue)
	if signal == "" {
		signal = "TERM"
	}
	pid := s.vmHandles(vm.ID).PID
	op.stage("send_signal", "pid", pid, "signal", signal)
	if err := s.privOps().SignalProcess(ctx, pid, signal); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("wait_for_exit", "pid", pid)
	if err := s.net.waitForExit(ctx, pid, vm.Runtime.APISockPath, 30*time.Second); err != nil {
		if !errors.Is(err, errWaitForExitTimeout) {
			return model.VMRecord{}, err
		}
		op.stage("signal_timeout", "pid", pid, "signal", signal)
	}
	op.stage("cleanup_runtime")
	if err := s.cleanupRuntime(ctx, vm, true); err != nil {
		return model.VMRecord{}, err
	}
	vm.State = model.VMStateStopped
	vm.Runtime.State = model.VMStateStopped
	clearRuntimeTeardownState(&vm)
	s.clearVMHandles(vm)
	system.TouchNow(&vm)
	if err := s.store.UpsertVM(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	return vm, nil
}
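
// RestartVM is stop followed by start under a single per-VM lock, so no
// other lifecycle operation can interleave between the two halves.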
func (s *VMService) RestartVM(ctx context.Context, idOrName string) (vm model.VMRecord, err error) {
	op := s.beginOperation(ctx, "vm.restart", "vm_ref", idOrName)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	resolved, err := s.FindVM(ctx, idOrName)
	if err != nil {
		return model.VMRecord{}, err
	}
	return s.withVMLockByID(ctx, resolved.ID, func(vm model.VMRecord) (model.VMRecord, error) {
		op.stage("stop")
		vm, err = s.stopVMLocked(ctx, vm)
		if err != nil {
			return model.VMRecord{}, err
		}
		image, err := s.store.GetImageByID(ctx, vm.ImageID)
		if err != nil {
			return model.VMRecord{}, err
		}
		op.stage("start", vmLogAttrs(vm)...)
		return s.startVMLocked(ctx, vm, image)
	})
}
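
// DeleteVM resolves the VM reference, takes the per-VM lock, and removes
// the VM: process, runtime state, store record, on-disk directory, and
// host-key pins.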
func (s *VMService) DeleteVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		return s.deleteVMLocked(ctx, vm)
	})
}
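
// deleteVMLocked deletes a VM whose lock is already held. A still-running
// firecracker process is killed best-effort first; cleanupRuntime below
// tears it down regardless.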
func (s *VMService) deleteVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
	vm = current
	op := s.beginOperation(ctx, "vm.delete", "vm_ref", vm.ID)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	if s.vmAlive(vm) {
		pid := s.vmHandles(vm.ID).PID
		op.stage("kill_running_vm", "pid", pid)
		// Best-effort: cleanupRuntime below tears the process down
		// regardless. A kill failure here only matters when it
		// surfaces something operators should see (permission
		// denied, etc.), so promote it from a silent _ to a Warn
		// without changing the control flow.
		if killErr := s.net.killVMProcess(ctx, pid); killErr != nil && s.logger != nil {
			s.logger.Warn("kill vm process during delete failed", append(vmLogAttrs(vm), "pid", pid, "error", killErr.Error())...)
		}
	}
	op.stage("cleanup_runtime")
	if err := s.cleanupRuntime(ctx, vm, false); err != nil {
		return model.VMRecord{}, err
	}
	clearRuntimeTeardownState(&vm)
	op.stage("delete_store_record")
	if err := s.store.DeleteVM(ctx, vm.ID); err != nil {
		return model.VMRecord{}, err
	}
	if vm.Runtime.VMDir != "" {
		op.stage("delete_vm_dir", "vm_dir", vm.Runtime.VMDir)
		if err := os.RemoveAll(vm.Runtime.VMDir); err != nil {
			return model.VMRecord{}, err
		}
	}
	// Drop any host-key pins. A future VM reusing this IP or name
	// would otherwise trip the TOFU mismatch branch in
	// TOFUHostKeyCallback and fail to connect.
	removeVMKnownHosts(s.layout.KnownHostsPath, vm, s.logger)
	return vm, nil
}