daemon: sync guest over ssh before stop to preserve workspace writes
VM stop has been quietly losing data freshly written via
`vm workspace prepare`: stop+start of a workspace-prepared VM would
come back with /root/repo wiped on the work disk.
Root cause is firecracker + Debian's systemd defaults. FC's
SendCtrlAltDel (the only "graceful shutdown" action FC exposes) just
delivers the keystroke; what the guest does with it is its choice.
Debian routes ctrl-alt-del.target -> reboot.target, so the guest
reboots, FC stays alive, the daemon's 10s wait_for_exit window
expires, and the SIGKILL fallback drops anything still in FC's
userspace I/O path. For an idle VM that's invisible. For one that
just took hundreds of small writes through a workspace prepare, it's
data loss.
Fix is to dial the guest over SSH inside StopVM and run
`sync; systemctl --no-block poweroff || /sbin/poweroff -f &` before
the existing SendCtrlAltDel path. The synchronous `sync` is the
load-bearing piece — it blocks until every dirty page hits virtio-blk
and lands in the on-host root.ext4. Whether poweroff completes
before SIGKILL fires is incidental; sync has already run. SSH
unreachable falls back to the old SendCtrlAltDel behaviour so a
broken-network guest can't make stop hang.
Bounded by a 5s SSH-dial timeout so a half-broken guest can't extend
the overall stop window past gracefulShutdownWait.
Also adds two smoke scenarios:
- `workspace + stop/start`: prepare -> stop -> start -> assert
marker survives. This is the regression test that caught the bug.
- `vm exec`: end-to-end coverage for d59425a — auto-cd into the
prepared workspace, exit-code propagation, dirty-host warning,
--auto-prepare resync, refusal on stopped VM.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d59425adb9
commit
c9358ab390
2 changed files with 180 additions and 3 deletions
|
|
@ -3,12 +3,15 @@ package daemon
|
|||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"io"
|
||||
"net"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"banger/internal/api"
|
||||
"banger/internal/guest"
|
||||
"banger/internal/model"
|
||||
"banger/internal/system"
|
||||
)
|
||||
|
|
@ -130,8 +133,35 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v
|
|||
}
|
||||
pid := s.vmHandles(vm.ID).PID
|
||||
op.stage("graceful_shutdown")
|
||||
if err := s.net.sendCtrlAltDel(ctx, vm.Runtime.APISockPath); err != nil {
|
||||
return model.VMRecord{}, err
|
||||
// Reach into the guest over SSH to force a sync + queue a poweroff
|
||||
// before falling back on FC's SendCtrlAltDel. The sync is what
|
||||
// keeps stop() from losing data: every dirty page the guest hasn't
|
||||
// flushed through virtio-blk to the work disk is written out
|
||||
// before this RPC returns. Without it, files freshly created via
|
||||
// `vm workspace prepare` can disappear across stop+start, because
|
||||
// the 10-second wait_for_exit window expires (FC doesn't exit on
|
||||
// SendCtrlAltDel — Debian routes ctrl-alt-del.target → reboot.target,
|
||||
// not poweroff) and the fallback SIGKILL drops everything still
|
||||
// in FC's userspace I/O path.
|
||||
//
|
||||
// `systemctl --no-block poweroff` is queued for the same reason
|
||||
// SendCtrlAltDel was here originally — it's how stop() asks the
|
||||
// guest to halt. That request is best-effort; FC may or may not
|
||||
// exit before the SIGKILL fallback fires. Either way, sync
|
||||
// already ran, so the on-host root.ext4 is consistent regardless.
|
||||
//
|
||||
// SendCtrlAltDel survives as a fallback for guests where SSH
|
||||
// itself is unreachable (broken sshd, network down, drifted host
|
||||
// key); it doesn't fix the data-loss path, but it's the existing
|
||||
// last-resort signal and is at least no worse than today.
|
||||
if err := s.requestGuestPoweroff(ctx, vm); err != nil {
|
||||
if s.logger != nil {
|
||||
s.logger.Warn("guest ssh poweroff failed; falling back to ctrl+alt+del",
|
||||
append(vmLogAttrs(vm), "error", err.Error())...)
|
||||
}
|
||||
if fallbackErr := s.net.sendCtrlAltDel(ctx, vm.Runtime.APISockPath); fallbackErr != nil {
|
||||
return model.VMRecord{}, fallbackErr
|
||||
}
|
||||
}
|
||||
op.stage("wait_for_exit", "pid", pid)
|
||||
if err := s.net.waitForExit(ctx, pid, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil {
|
||||
|
|
@ -155,6 +185,39 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v
|
|||
return vm, nil
|
||||
}
|
||||
|
||||
// requestGuestPoweroff dials the guest over SSH and runs a sync +
|
||||
// queues a poweroff job. The sync is the load-bearing piece — see the
|
||||
// comment in stopVMLocked. Returns the dial / SSH error if the guest
|
||||
// is unreachable; the caller treats that as a fallback signal.
|
||||
//
|
||||
// Bounded by a hard 5-second SSH-dial timeout so a half-broken guest
|
||||
// doesn't extend the overall stop window past the existing
|
||||
// gracefulShutdownWait. If the dial doesn't succeed in that window we
|
||||
// surface an error and let the caller take the SendCtrlAltDel path.
|
||||
func (s *VMService) requestGuestPoweroff(ctx context.Context, vm model.VMRecord) error {
|
||||
guestIP := strings.TrimSpace(vm.Runtime.GuestIP)
|
||||
if guestIP == "" {
|
||||
return errors.New("guest IP unknown")
|
||||
}
|
||||
dialCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
defer cancel()
|
||||
address := net.JoinHostPort(guestIP, "22")
|
||||
client, err := guest.Dial(dialCtx, address, s.config.SSHKeyPath, s.layout.KnownHostsPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer client.Close()
|
||||
// `sync` runs synchronously and blocks RunScript until every dirty
|
||||
// page hits virtio-blk → root.ext4. That's the persistence
|
||||
// guarantee. The `systemctl --no-block poweroff` queues a job and
|
||||
// returns; whether poweroff.target completes before the SIGKILL
|
||||
// fallback fires is incidental — by then sync has already done
|
||||
// its work. The `|| /sbin/poweroff -f` is the last-ditch fallback
|
||||
// when systemd itself is wedged.
|
||||
const script = "sync; systemctl --no-block poweroff || /sbin/poweroff -f &"
|
||||
return client.RunScript(ctx, script, io.Discard)
|
||||
}
|
||||
|
||||
func (s *VMService) KillVM(ctx context.Context, params api.VMKillParams) (model.VMRecord, error) {
|
||||
return s.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
|
||||
return s.killVMLocked(ctx, vm, params.Signal)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue