Survive banger update with running VMs
Two coupled fixes that together make the daemon-restart path of `banger update` non-destructive for running guests: 1. Unit templates set `KillMode=process` on bangerd.service and bangerd-root.service. The default control-group behaviour sent SIGKILL to every process in the cgroup on stop/restart — including jailer-spawned firecracker children, since fork/exec doesn't escape a systemd cgroup. With process mode only the unit's main PID is signalled; FC children stay alive in the (unowned) cgroup until the new helper instance starts up and re-claims them. 2. `fcproc.FindPID` falls back to the jailer-written pidfile at `<chroot>/firecracker.pid` (sibling of the api-sock target) when `pgrep -n -f <api-sock>` doesn't find a match. pgrep can't see jailer'd FCs because their cmdline only carries the chroot-relative `--api-sock /firecracker.socket`, not the host-side path. The pidfile is jailer's actual record of the post-exec FC PID, so reconcile can verify the surviving process is the right one (comm == "firecracker") and re-seed handles.json without tearing down the VM's dm-snapshot. Verified live on the dev host: started a VM, restarted the helper unit, restarted the daemon unit, and confirmed the FC PID was unchanged, vm list still showed the guest as running, and `banger vm ssh` returned the same boot_id pre and post restart. The systemd journal now reports "firecracker remains running after unit stopped" and "Found left-over process X (firecracker) in control group while starting unit. Ignoring." — exactly the shape `KillMode=process` is supposed to produce. Tests cover both the parser (parseVersionOutput from the v0.1.2 fix) and the new pidfile lookup: happy path, missing pidfile, stale pid, wrong comm, garbage content, non-symlink api-sock, whitespace tolerance. 
CHANGELOG corrects v0.1.0's misleading "daemon restarts do not interrupt running guests" line and documents the unit-refresh caveat: existing v0.1.0–v0.1.3 installs need a one-time `sudo banger system install` after updating to v0.1.4 to pick up the new KillMode directive (`banger update` swaps binaries, not unit files). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
9c2e6a4647
commit
cec7291184
5 changed files with 310 additions and 3 deletions
173
internal/daemon/fcproc/findpid_jailer_test.go
Normal file
173
internal/daemon/fcproc/findpid_jailer_test.go
Normal file
|
|
@ -0,0 +1,173 @@
|
|||
package fcproc
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// pidfileFixture is the scratch directory layout the findByJailerPidfile
// tests run against. newPidfileFixture creates the directories, addProc
// fills in per-pid comm files under proc, and buildVMSocket lays out a
// per-VM chroot under chroots plus an api-sock symlink under runtime.
type pidfileFixture struct {
	// root is the per-test temp dir; proc, runtime and chroots are the
	// "proc", "runtime" and "chroots" directories created beneath it.
	root, proc, runtime, chroots string
}
|
||||
|
||||
func newPidfileFixture(t *testing.T) *pidfileFixture {
|
||||
t.Helper()
|
||||
root := t.TempDir()
|
||||
f := &pidfileFixture{
|
||||
root: root,
|
||||
proc: filepath.Join(root, "proc"),
|
||||
runtime: filepath.Join(root, "runtime"),
|
||||
chroots: filepath.Join(root, "chroots"),
|
||||
}
|
||||
for _, dir := range []string{f.proc, f.runtime, f.chroots} {
|
||||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||||
t.Fatalf("mkdir %s: %v", dir, err)
|
||||
}
|
||||
}
|
||||
prev := procDir
|
||||
procDir = f.proc
|
||||
t.Cleanup(func() { procDir = prev })
|
||||
return f
|
||||
}
|
||||
|
||||
// addProc writes /proc/<pid>/comm. Mirrors the real /proc shape (comm
|
||||
// has a trailing newline; production code TrimSpaces it).
|
||||
func (f *pidfileFixture) addProc(t *testing.T, pid int, comm string) {
|
||||
t.Helper()
|
||||
pidDir := filepath.Join(f.proc, fmt.Sprint(pid))
|
||||
if err := os.MkdirAll(pidDir, 0o755); err != nil {
|
||||
t.Fatalf("mkdir %s: %v", pidDir, err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(pidDir, "comm"), []byte(comm+"\n"), 0o644); err != nil {
|
||||
t.Fatalf("write comm: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// buildVMSocket lays out the chroot for a VM and returns the api-sock
|
||||
// path the test points findByJailerPidfile at. pidfileContent is what
|
||||
// `cat <chroot>/firecracker.pid` will return — pass an empty string to
|
||||
// skip writing the pidfile.
|
||||
func (f *pidfileFixture) buildVMSocket(t *testing.T, vmid, pidfileContent string) (apiSock string) {
|
||||
t.Helper()
|
||||
chroot := filepath.Join(f.chroots, vmid, "root")
|
||||
if err := os.MkdirAll(chroot, 0o755); err != nil {
|
||||
t.Fatalf("mkdir chroot: %v", err)
|
||||
}
|
||||
socketTarget := filepath.Join(chroot, "firecracker.socket")
|
||||
if err := os.WriteFile(socketTarget, nil, 0o600); err != nil {
|
||||
t.Fatalf("write socket placeholder: %v", err)
|
||||
}
|
||||
if pidfileContent != "" {
|
||||
if err := os.WriteFile(filepath.Join(chroot, "firecracker.pid"), []byte(pidfileContent), 0o600); err != nil {
|
||||
t.Fatalf("write pidfile: %v", err)
|
||||
}
|
||||
}
|
||||
apiSock = filepath.Join(f.runtime, "fc-"+vmid+".sock")
|
||||
if err := os.Symlink(socketTarget, apiSock); err != nil {
|
||||
t.Fatalf("symlink api sock: %v", err)
|
||||
}
|
||||
return apiSock
|
||||
}
|
||||
|
||||
func TestFindByJailerPidfileHappyPath(t *testing.T) {
|
||||
f := newPidfileFixture(t)
|
||||
apiSock := f.buildVMSocket(t, "abc", "100\n")
|
||||
f.addProc(t, 100, "firecracker")
|
||||
|
||||
got, err := findByJailerPidfile(apiSock)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if got != 100 {
|
||||
t.Fatalf("pid = %d, want 100", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindByJailerPidfileMissingPidfile(t *testing.T) {
|
||||
f := newPidfileFixture(t)
|
||||
// VM exists in the chroot layout but no pidfile (e.g. VM was created
|
||||
// but never started, or stopped and pidfile cleared).
|
||||
apiSock := f.buildVMSocket(t, "abc", "")
|
||||
|
||||
_, err := findByJailerPidfile(apiSock)
|
||||
if !errors.Is(err, errFirecrackerPIDNotFound) {
|
||||
t.Fatalf("err = %v, want errFirecrackerPIDNotFound", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindByJailerPidfileStalePID(t *testing.T) {
|
||||
f := newPidfileFixture(t)
|
||||
// Pidfile points at a PID with no /proc entry — the FC died but the
|
||||
// pidfile was left behind. Reconcile must treat this as "not running"
|
||||
// so the rediscoverHandles path can mark the VM stopped cleanly.
|
||||
apiSock := f.buildVMSocket(t, "abc", "100\n")
|
||||
// Deliberately don't addProc(100, ...).
|
||||
|
||||
_, err := findByJailerPidfile(apiSock)
|
||||
if !errors.Is(err, errFirecrackerPIDNotFound) {
|
||||
t.Fatalf("err = %v, want errFirecrackerPIDNotFound", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindByJailerPidfileWrongComm(t *testing.T) {
|
||||
f := newPidfileFixture(t)
|
||||
// PID was recycled by the kernel and now belongs to some other
|
||||
// process. The comm check is what catches this — pidfile content is
|
||||
// untrusted across reboots / PID-wraparound.
|
||||
apiSock := f.buildVMSocket(t, "abc", "100\n")
|
||||
f.addProc(t, 100, "bash")
|
||||
|
||||
_, err := findByJailerPidfile(apiSock)
|
||||
if !errors.Is(err, errFirecrackerPIDNotFound) {
|
||||
t.Fatalf("err = %v, want errFirecrackerPIDNotFound", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindByJailerPidfileGarbageContent(t *testing.T) {
|
||||
f := newPidfileFixture(t)
|
||||
apiSock := f.buildVMSocket(t, "abc", "not-a-pid\n")
|
||||
|
||||
_, err := findByJailerPidfile(apiSock)
|
||||
if !errors.Is(err, errFirecrackerPIDNotFound) {
|
||||
t.Fatalf("err = %v, want errFirecrackerPIDNotFound", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindByJailerPidfileNonSymlinkApiSock(t *testing.T) {
|
||||
f := newPidfileFixture(t)
|
||||
// Direct (non-jailer) launches produce a regular-file api sock with
|
||||
// no chroot beside it. Pidfile lookup can't help; fall through cleanly.
|
||||
apiSock := filepath.Join(f.runtime, "direct-launch.sock")
|
||||
if err := os.WriteFile(apiSock, nil, 0o600); err != nil {
|
||||
t.Fatalf("write apiSock: %v", err)
|
||||
}
|
||||
|
||||
_, err := findByJailerPidfile(apiSock)
|
||||
if !errors.Is(err, errFirecrackerPIDNotFound) {
|
||||
t.Fatalf("err = %v, want errFirecrackerPIDNotFound", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindByJailerPidfileTrimsWhitespace(t *testing.T) {
|
||||
f := newPidfileFixture(t)
|
||||
// Some FC versions write the pidfile with stray whitespace; the
|
||||
// parser must tolerate it.
|
||||
apiSock := f.buildVMSocket(t, "abc", " 100 \n\n")
|
||||
f.addProc(t, 100, "firecracker")
|
||||
|
||||
got, err := findByJailerPidfile(apiSock)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if got != 100 {
|
||||
t.Fatalf("pid = %d, want 100", got)
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue