Survive banger update with running VMs
Two coupled fixes that together make the daemon-restart path of `banger update` non-destructive for running guests: 1. Unit templates set `KillMode=process` on bangerd.service and bangerd-root.service. The default control-group behaviour sent SIGKILL to every process in the cgroup on stop/restart — including jailer-spawned firecracker children, since fork/exec doesn't escape a systemd cgroup. With process mode only the unit's main PID is signalled; FC children stay alive in the (unowned) cgroup until the new helper instance starts up and re-claims them. 2. `fcproc.FindPID` falls back to the jailer-written pidfile at `<chroot>/firecracker.pid` (sibling of the api-sock target) when `pgrep -n -f <api-sock>` doesn't find a match. pgrep can't see jailer'd FCs because their cmdline only carries the chroot-relative `--api-sock /firecracker.socket`, not the host-side path. The pidfile is jailer's actual record of the post-exec FC PID, so reconcile can verify the surviving process is the right one (comm == "firecracker") and re-seed handles.json without tearing down the VM's dm-snapshot. Verified live on the dev host: started a VM, restarted the helper unit, restarted the daemon unit, and confirmed the FC PID was unchanged, vm list still showed the guest as running, and `banger vm ssh` returned the same boot_id pre and post restart. The systemd journal now reports "firecracker remains running after unit stopped" and "Found left-over process X (firecracker) in control group while starting unit. Ignoring." — exactly the shape `KillMode=process` is supposed to produce. Tests cover both the parser (parseVersionOutput from the v0.1.2 fix) and the new pidfile lookup: happy path, missing pidfile, stale pid, wrong comm, garbage content, non-symlink api-sock, whitespace tolerance. 
CHANGELOG corrects v0.1.0's misleading "daemon restarts do not interrupt running guests" line and documents the unit-refresh caveat: existing v0.1.0–v0.1.3 installs need a one-time `sudo banger system install` after updating to v0.1.4 to pick up the new KillMode directive (`banger update` swaps binaries, not unit files). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
9c2e6a4647
commit
cec7291184
5 changed files with 310 additions and 3 deletions
|
|
@ -25,6 +25,16 @@ import (
|
|||
"banger/internal/system"
|
||||
)
|
||||
|
||||
// errFirecrackerPIDNotFound is returned by FindPID when neither pgrep nor
// the jailer pidfile yields a PID, and by findByJailerPidfile when the
// pidfile is missing, unreadable, or doesn't point at a live firecracker
// process. Surfaces to callers as a "this VM isn't running" signal, not
// as a hard failure.
|
||||
var errFirecrackerPIDNotFound = errors.New("firecracker pid not found")
|
||||
|
||||
// procDir is the kernel's per-process inspection directory. Var so tests
|
||||
// can swap in a fake /proc-shaped fixture in t.TempDir().
|
||||
var procDir = "/proc"
|
||||
|
||||
// ErrWaitForExitTimeout is returned by WaitForExit when the deadline passes
|
||||
// before the process exits. Callers use errors.Is to detect it.
|
||||
var ErrWaitForExitTimeout = errors.New("timed out waiting for VM to exit")
|
||||
|
|
@ -256,9 +266,35 @@ func chownChmodNoFollow(ctx context.Context, runner Runner, path string, uid, gi
|
|||
return nil
|
||||
}
|
||||
|
||||
// FindPID returns the PID of the firecracker process listening on apiSock,
|
||||
// located via pgrep.
|
||||
// FindPID returns the PID of the firecracker process backing apiSock.
|
||||
//
|
||||
// Two strategies, tried in order:
|
||||
//
|
||||
// 1. pgrep -n -f apiSock — cheap, works for direct (non-jailer) launches
|
||||
// because the host-side socket path appears verbatim in firecracker's
|
||||
// cmdline.
|
||||
// 2. Jailer pidfile — for jailer'd firecrackers, pgrep can't match
|
||||
// because the cmdline only carries the chroot-relative
|
||||
// `--api-sock /firecracker.socket`. Jailer (v1.x) writes the
|
||||
// post-exec firecracker PID to `<chroot>/firecracker.pid` by default.
|
||||
// Read it; verify the PID is alive and its comm is `firecracker`.
|
||||
// Caller must run with read access to the pidfile (root in the
|
||||
// system-mode helper; daemon UID in dev mode where banger doesn't
|
||||
// drop privs).
|
||||
//
|
||||
// This is what makes post-restart reconcile re-attach to surviving
|
||||
// guests instead of mistaking them for stale.
|
||||
func (m *Manager) FindPID(ctx context.Context, apiSock string) (int, error) {
|
||||
if pid, err := m.findPIDByPgrep(ctx, apiSock); err == nil && pid > 0 {
|
||||
return pid, nil
|
||||
}
|
||||
if pid, err := findByJailerPidfile(apiSock); err == nil && pid > 0 {
|
||||
return pid, nil
|
||||
}
|
||||
return 0, errFirecrackerPIDNotFound
|
||||
}
|
||||
|
||||
func (m *Manager) findPIDByPgrep(ctx context.Context, apiSock string) (int, error) {
|
||||
out, err := m.runner.Run(ctx, "pgrep", "-n", "-f", apiSock)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
|
|
@ -266,6 +302,43 @@ func (m *Manager) FindPID(ctx context.Context, apiSock string) (int, error) {
|
|||
return strconv.Atoi(strings.TrimSpace(string(out)))
|
||||
}
|
||||
|
||||
// findByJailerPidfile reads the jailer-written pidfile that lives at
|
||||
// `<chroot>/firecracker.pid` (sibling of the api socket inside the
|
||||
// chroot), verifies the PID is alive and its /proc/<pid>/comm is
|
||||
// `firecracker`, and returns it.
|
||||
//
|
||||
// Returns errFirecrackerPIDNotFound when the api-sock isn't a symlink
|
||||
// (direct launch — pidfile shape doesn't apply), the pidfile is
|
||||
// missing or unreadable (VM stopped, or caller lacks privileges),
|
||||
// the pidfile content is garbage, or the PID points at a process
|
||||
// that's gone or never was firecracker.
|
||||
func findByJailerPidfile(apiSock string) (int, error) {
|
||||
target, err := os.Readlink(apiSock)
|
||||
if err != nil {
|
||||
return 0, errFirecrackerPIDNotFound
|
||||
}
|
||||
if !filepath.IsAbs(target) {
|
||||
target = filepath.Join(filepath.Dir(apiSock), target)
|
||||
}
|
||||
pidPath := filepath.Join(filepath.Dir(target), "firecracker.pid")
|
||||
pidBytes, err := os.ReadFile(pidPath)
|
||||
if err != nil {
|
||||
return 0, errFirecrackerPIDNotFound
|
||||
}
|
||||
pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
|
||||
if err != nil || pid <= 0 {
|
||||
return 0, errFirecrackerPIDNotFound
|
||||
}
|
||||
commBytes, err := os.ReadFile(filepath.Join(procDir, strconv.Itoa(pid), "comm"))
|
||||
if err != nil {
|
||||
return 0, errFirecrackerPIDNotFound
|
||||
}
|
||||
if strings.TrimSpace(string(commBytes)) != "firecracker" {
|
||||
return 0, errFirecrackerPIDNotFound
|
||||
}
|
||||
return pid, nil
|
||||
}
|
||||
|
||||
// ResolvePID prefers pgrep and falls back to the firecracker machine PID.
|
||||
// Returns 0 if neither source yields a PID.
|
||||
func (m *Manager) ResolvePID(ctx context.Context, machine *firecracker.Machine, apiSock string) int {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue