banger/internal/daemon/fcproc/findpid_jailer_test.go
Thales Maciel cec7291184
Survive banger update with running VMs
Two coupled fixes that together make the daemon-restart path of
`banger update` non-destructive for running guests:

1. Unit templates set `KillMode=process` on bangerd.service and
   bangerd-root.service. The default control-group behaviour sent
   SIGKILL to every process in the cgroup on stop/restart — including
   jailer-spawned firecracker children, since fork/exec doesn't
   escape a systemd cgroup. With process mode only the unit's main
   PID is signalled; FC children stay alive in the (unowned)
   cgroup until the new helper instance starts up and re-claims them.

2. `fcproc.FindPID` falls back to the jailer-written pidfile at
   `<chroot>/firecracker.pid` (sibling of the api-sock target) when
   `pgrep -n -f <api-sock>` doesn't find a match. pgrep can't see
   jailer'd FCs because their cmdline only carries the chroot-relative
   `--api-sock /firecracker.socket`, not the host-side path. The
   pidfile is jailer's actual record of the post-exec FC PID, so
   reconcile can verify the surviving process is the right one
   (comm == "firecracker") and re-seed handles.json without tearing
   down the VM's dm-snapshot.

Verified live on the dev host: started a VM, restarted the helper
unit, restarted the daemon unit, and confirmed the FC PID was
unchanged, vm list still showed the guest as running, and
`banger vm ssh` returned the same boot_id pre and post restart.
The systemd journal now reports "firecracker remains running after
unit stopped" and "Found left-over process X (firecracker) in
control group while starting unit. Ignoring." — exactly the shape
`KillMode=process` is supposed to produce.

Tests cover both the parser (parseVersionOutput from the v0.1.2
fix) and the new pidfile lookup: happy path, missing pidfile,
stale pid, wrong comm, garbage content, non-symlink api-sock,
whitespace tolerance.

CHANGELOG corrects v0.1.0's misleading "daemon restarts do not
interrupt running guests" line and documents the unit-refresh
caveat: existing v0.1.0–v0.1.3 installs need a one-time
`sudo banger system install` after updating to v0.1.4 to pick up
the new KillMode directive (`banger update` swaps binaries, not
unit files).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 17:09:15 -03:00

173 lines
5.3 KiB
Go

package fcproc
import (
"errors"
"fmt"
"os"
"path/filepath"
"testing"
)
// pidfileFixture is the scratch directory layout the pidfile-lookup
// tests run against. It stands in for three host locations: a
// /proc-like tree holding one <pid>/comm entry per fake process, a
// runtime directory holding the api-sock symlinks, and a directory of
// per-VM chroots that the symlinks point into and that hold each VM's
// firecracker.pid file.
type pidfileFixture struct {
	// root is the per-test temporary directory that owns everything below.
	root string
	// proc is the fake /proc tree; procDir is pointed here while the test runs.
	proc string
	// runtime is where the host-side api-sock symlinks are created.
	runtime string
	// chroots is the parent of every <vmid>/root faux chroot.
	chroots string
}
// newPidfileFixture creates the proc/runtime/chroots directories under
// a fresh t.TempDir and points the package-level procDir at the fake
// proc tree. The previous procDir value is restored by a t.Cleanup
// hook. Because procDir is shared package state, tests that use this
// fixture should not run in parallel with each other.
func newPidfileFixture(t *testing.T) *pidfileFixture {
	t.Helper()

	base := t.TempDir()
	fx := &pidfileFixture{root: base}
	fx.proc = filepath.Join(base, "proc")
	fx.runtime = filepath.Join(base, "runtime")
	fx.chroots = filepath.Join(base, "chroots")

	for _, d := range [...]string{fx.proc, fx.runtime, fx.chroots} {
		err := os.MkdirAll(d, 0o755)
		if err != nil {
			t.Fatalf("mkdir %s: %v", d, err)
		}
	}

	// Register the restore first, then swap in the fake tree.
	saved := procDir
	t.Cleanup(func() { procDir = saved })
	procDir = fx.proc

	return fx
}
// addProc registers a fake process in the fixture's proc tree by
// creating <proc>/<pid>/comm. The comm value is written with a trailing
// newline, as the real /proc does; production code is expected to trim
// it.
func (f *pidfileFixture) addProc(t *testing.T, pid int, comm string) {
	t.Helper()

	entry := filepath.Join(f.proc, fmt.Sprint(pid))
	if mkErr := os.MkdirAll(entry, 0o755); mkErr != nil {
		t.Fatalf("mkdir %s: %v", entry, mkErr)
	}

	commPath := filepath.Join(entry, "comm")
	contents := []byte(comm + "\n")
	if wrErr := os.WriteFile(commPath, contents, 0o644); wrErr != nil {
		t.Fatalf("write comm: %v", wrErr)
	}
}
// buildVMSocket creates the faux chroot for vmid at
// <chroots>/<vmid>/root, drops an empty firecracker.socket placeholder
// inside it, and symlinks <runtime>/fc-<vmid>.sock to that placeholder.
// The symlink path is returned; it is what tests hand to
// findByJailerPidfile.
//
// pidfileContent is written verbatim to <chroot>/firecracker.pid. An
// empty string means no pidfile is written.
func (f *pidfileFixture) buildVMSocket(t *testing.T, vmid, pidfileContent string) (apiSock string) {
	t.Helper()

	jail := filepath.Join(f.chroots, vmid, "root")
	if err := os.MkdirAll(jail, 0o755); err != nil {
		t.Fatalf("mkdir chroot: %v", err)
	}

	// A regular file is enough: only the symlink target's location matters.
	target := filepath.Join(jail, "firecracker.socket")
	if err := os.WriteFile(target, nil, 0o600); err != nil {
		t.Fatalf("write socket placeholder: %v", err)
	}

	if len(pidfileContent) > 0 {
		pidfile := filepath.Join(jail, "firecracker.pid")
		if err := os.WriteFile(pidfile, []byte(pidfileContent), 0o600); err != nil {
			t.Fatalf("write pidfile: %v", err)
		}
	}

	apiSock = filepath.Join(f.runtime, "fc-"+vmid+".sock")
	if err := os.Symlink(target, apiSock); err != nil {
		t.Fatalf("symlink api sock: %v", err)
	}
	return apiSock
}
// TestFindByJailerPidfileHappyPath checks that a well-formed pidfile
// whose PID has a "firecracker" comm entry resolves to that PID.
func TestFindByJailerPidfileHappyPath(t *testing.T) {
	const wantPID = 100

	fx := newPidfileFixture(t)
	sock := fx.buildVMSocket(t, "abc", "100\n")
	fx.addProc(t, wantPID, "firecracker")

	pid, err := findByJailerPidfile(sock)
	switch {
	case err != nil:
		t.Fatalf("unexpected error: %v", err)
	case pid != wantPID:
		t.Fatalf("pid = %d, want 100", pid)
	}
}
// TestFindByJailerPidfileMissingPidfile checks that a chroot without a
// firecracker.pid file yields errFirecrackerPIDNotFound.
func TestFindByJailerPidfileMissingPidfile(t *testing.T) {
	fx := newPidfileFixture(t)

	// The chroot layout is present but the pidfile is not — for example
	// a VM that was created but never started, or one that was stopped
	// and had its pidfile cleared.
	sock := fx.buildVMSocket(t, "abc", "")

	if _, err := findByJailerPidfile(sock); !errors.Is(err, errFirecrackerPIDNotFound) {
		t.Fatalf("err = %v, want errFirecrackerPIDNotFound", err)
	}
}
// TestFindByJailerPidfileStalePID checks that a pidfile naming a PID
// with no entry in the proc tree yields errFirecrackerPIDNotFound.
func TestFindByJailerPidfileStalePID(t *testing.T) {
	fx := newPidfileFixture(t)

	// The pidfile outlived its process: PID 100 is written down, but no
	// addProc call backs it, so the proc tree has no such entry.
	// Reconcile has to read this as "not running" so that the
	// rediscoverHandles path can mark the VM stopped cleanly.
	sock := fx.buildVMSocket(t, "abc", "100\n")

	_, err := findByJailerPidfile(sock)
	if errors.Is(err, errFirecrackerPIDNotFound) {
		return
	}
	t.Fatalf("err = %v, want errFirecrackerPIDNotFound", err)
}
// TestFindByJailerPidfileWrongComm checks that a live PID whose comm is
// not "firecracker" is rejected with errFirecrackerPIDNotFound.
func TestFindByJailerPidfileWrongComm(t *testing.T) {
	const recycledPID = 100

	fx := newPidfileFixture(t)

	// Simulate the kernel having handed the PID to an unrelated
	// process. Pidfile content cannot be trusted across reboots or PID
	// wraparound; the comm comparison is the guard against that.
	sock := fx.buildVMSocket(t, "abc", "100\n")
	fx.addProc(t, recycledPID, "bash")

	if _, err := findByJailerPidfile(sock); !errors.Is(err, errFirecrackerPIDNotFound) {
		t.Fatalf("err = %v, want errFirecrackerPIDNotFound", err)
	}
}
// TestFindByJailerPidfileGarbageContent checks that a pidfile whose
// content is not a number yields errFirecrackerPIDNotFound.
func TestFindByJailerPidfileGarbageContent(t *testing.T) {
	fx := newPidfileFixture(t)
	sock := fx.buildVMSocket(t, "abc", "not-a-pid\n")

	_, err := findByJailerPidfile(sock)
	if notFound := errors.Is(err, errFirecrackerPIDNotFound); !notFound {
		t.Fatalf("err = %v, want errFirecrackerPIDNotFound", err)
	}
}
// TestFindByJailerPidfileNonSymlinkApiSock checks that an api-sock path
// that is a regular file rather than a symlink yields
// errFirecrackerPIDNotFound.
func TestFindByJailerPidfileNonSymlinkApiSock(t *testing.T) {
	fx := newPidfileFixture(t)

	// A direct (non-jailer) launch leaves a regular file at the
	// api-sock path and no chroot next to it, so there is no pidfile to
	// consult. The lookup should fall through cleanly.
	plainSock := filepath.Join(fx.runtime, "direct-launch.sock")
	if wrErr := os.WriteFile(plainSock, nil, 0o600); wrErr != nil {
		t.Fatalf("write apiSock: %v", wrErr)
	}

	if _, err := findByJailerPidfile(plainSock); !errors.Is(err, errFirecrackerPIDNotFound) {
		t.Fatalf("err = %v, want errFirecrackerPIDNotFound", err)
	}
}
// TestFindByJailerPidfileTrimsWhitespace checks that leading and
// trailing whitespace around the PID in the pidfile is ignored.
func TestFindByJailerPidfileTrimsWhitespace(t *testing.T) {
	const wantPID = 100

	fx := newPidfileFixture(t)

	// Some FC versions pad the pidfile with stray whitespace; parsing
	// has to cope with that.
	sock := fx.buildVMSocket(t, "abc", " 100 \n\n")
	fx.addProc(t, wantPID, "firecracker")

	pid, err := findByJailerPidfile(sock)
	switch {
	case err != nil:
		t.Fatalf("unexpected error: %v", err)
	case pid != wantPID:
		t.Fatalf("pid = %d, want 100", pid)
	}
}