update: VMs survive banger update and rollback

Three load-bearing fixes that together let `banger update` (and its
auto-rollback path) restart the helper + daemon without killing
every running VM. New smoke scenarios prove the property end-to-end.

Bug fixes:

1. Disable the firecracker SDK's signal-forwarding goroutine. The
   default ForwardSignals = [SIGINT, SIGQUIT, SIGTERM, SIGHUP,
   SIGABRT] installs a handler in the helper that propagates the
   helper's SIGTERM (sent by systemd on
   `systemctl stop bangerd-root.service`) to every running
   firecracker child. Set ForwardSignals to an empty (non-nil)
   slice so setupSignals short-circuits at len()==0.

2. Add SendSIGKILL=no to bangerd-root.service. KillMode=process
   limits the initial SIGTERM to the helper main, but systemd
   still SIGKILLs leftover cgroup processes during the
   FinalKillSignal stage unless SendSIGKILL=no.

3. Route restart-helper / restart-daemon / wait-daemon-ready
   failures through rollbackAndRestart instead of rollbackAndWrap.
   rollbackAndWrap restored the .previous binaries but never
   restarted the failed unit, so a failed update left the helper
   dead with the rolled-back binary sitting on disk.

Testing infrastructure (production binaries unaffected):

- Hidden --manifest-url and --pubkey-file flags on `banger update`
  let the smoke harness redirect the updater at locally-built
  release artefacts. Marked Hidden in cobra; not advertised in
  --help.
- FetchManifestFrom / VerifyBlobSignatureWithKey /
  FetchAndVerifySignatureWithKey export the existing logic against
  caller-supplied URL / pubkey. The default entry points still
  call them with the embedded canonical values.

Smoke scenarios:

- update_check: --check against fake manifest reports update
  available
- update_to_unknown: --to v9.9.9 fails before any host mutation
- update_no_root: refuses without sudo, install untouched
- update_dry_run: stages + verifies, no swap, version unchanged
- update_keeps_vm_alive: real swap to v0.smoke.0; same VM (same
  boot_id) answers SSH after the daemon restart
- update_rollback_keeps_vm_alive: v0.smoke.broken-bangerd ships a
  bangerd that passes --check-migrations but exits 1 as the
  daemon. The post-swap `systemctl restart bangerd` fails,
  rollbackAndRestart fires, the .previous binaries are restored
  and re-restarted; the same VM still answers SSH afterwards
- daemon_admin (separate prep): covers `banger daemon socket`,
  `bangerd --check-migrations --system`, `sudo banger daemon
  stop`

The smoke release builder generates a fresh ECDSA P-256 keypair
with openssl, signs SHA256SUMS cosign-compatibly, and serves
artefacts from a backgrounded python http.server.
verify_smoke_check_test.go pins the openssl/cosign signature
equivalence so the smoke release builder can't silently drift.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Thales Maciel 2026-05-01 12:08:08 -03:00
parent 7e528f30b3
commit 2606bfbabb
No known key found for this signature in database
GPG key ID: 33112E6833C34679
8 changed files with 609 additions and 50 deletions

View file

@ -364,18 +364,34 @@ func renderRootHelperSystemdUnit() string {
"ExecStart=" + systemBangerdBin + " --root-helper",
"Restart=on-failure",
"RestartSec=1s",
// KillMode=process is load-bearing: the helper unit's cgroup is
// where every banger-launched firecracker process lives (see
// validateFirecrackerPID). Without this, `systemctl restart
// bangerd-root.service` — which `banger update` runs — would
// SIGKILL every in-flight VM along with the helper because
// systemd's default KillMode=control-group nukes the whole cgroup.
// With process mode, only the helper PID is signaled; firecracker
// children survive, the new helper instance re-attaches via the
// helper RPC, daemon reconcile re-seeds in-memory state, VM keeps
// running. `banger system uninstall` and the daemon's vm-stop
// path explicitly stop firecracker processes when actually needed.
// KillMode=process + SendSIGKILL=no together make the helper
// safe to restart while banger-launched firecrackers are
// running. firecracker lives in this unit's cgroup (jailer
// doesn't open a sub-cgroup), so:
//
// - Default control-group mode SIGKILLs every process in
// the cgroup on stop.
// - KillMode=process limits the initial SIGTERM to the
// helper main PID; systemd leaves remaining cgroup
// processes alone (and logs "Unit process N (firecracker)
// remains running after unit stopped").
// - SendSIGKILL=no disables the FinalKillSignal escalation
// that would otherwise SIGKILL leftovers after the timeout.
//
// One more pitfall: the firecracker SDK installs a default
// signal-forwarding goroutine in the helper that catches
// SIGTERM (etc.) and forwards it to every firecracker child.
// We disable that explicitly via ForwardSignals: []os.Signal{}
// in firecracker.buildConfig — without that override, systemd
// signaling the helper main would propagate to every running
// VM regardless of what these directives do.
//
// `banger system uninstall` and the daemon's vm-stop path
// explicitly stop firecracker processes when actually needed,
// so we don't lose the systemd-driven kill as a real safety
// net — banger drives those kills itself.
"KillMode=process",
"SendSIGKILL=no",
"Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"Environment=TMPDIR=" + installmeta.DefaultRootHelperRuntimeDir,
"UMask=0077",

View file

@ -30,10 +30,12 @@ const stagingTarballName = "release.tar.gz"
func (d *deps) newUpdateCommand() *cobra.Command {
var (
checkOnly bool
dryRun bool
force bool
toVersion string
checkOnly bool
dryRun bool
force bool
toVersion string
manifestURL string
pubkeyFile string
)
cmd := &cobra.Command{
Use: "update",
@ -68,10 +70,12 @@ talks to systemd. Run with sudo.
Args: noArgsUsage("usage: banger update [--check] [--dry-run] [--force] [--to vX.Y.Z]"),
RunE: func(cmd *cobra.Command, args []string) error {
return d.runUpdate(cmd, runUpdateOpts{
checkOnly: checkOnly,
dryRun: dryRun,
force: force,
toVersion: toVersion,
checkOnly: checkOnly,
dryRun: dryRun,
force: force,
toVersion: toVersion,
manifestURL: manifestURL,
pubkeyFile: pubkeyFile,
})
},
}
@ -79,23 +83,53 @@ talks to systemd. Run with sudo.
cmd.Flags().BoolVar(&dryRun, "dry-run", false, "fetch and verify, but do not swap or restart anything")
cmd.Flags().BoolVar(&force, "force", false, "skip in-flight-op refusal and post-restart doctor verification")
cmd.Flags().StringVar(&toVersion, "to", "", "specific release version to install (default: latest_stable from manifest)")
// Hidden test/dev hooks: redirect the updater at a non-default
// manifest URL and trust a non-default cosign public key. Used by
// the smoke suite to drive a real update against locally-built
// release artefacts. Production users have no reason to touch
// these; they are not advertised in --help.
cmd.Flags().StringVar(&manifestURL, "manifest-url", "", "")
cmd.Flags().StringVar(&pubkeyFile, "pubkey-file", "", "")
_ = cmd.Flags().MarkHidden("manifest-url")
_ = cmd.Flags().MarkHidden("pubkey-file")
return cmd
}
type runUpdateOpts struct {
checkOnly bool
dryRun bool
force bool
toVersion string
checkOnly bool
dryRun bool
force bool
toVersion string
manifestURL string
pubkeyFile string
}
func (d *deps) runUpdate(cmd *cobra.Command, opts runUpdateOpts) error {
ctx := cmd.Context()
out := cmd.OutOrStdout()
// Resolve the test/dev override flags up front so a bad
// --pubkey-file fails fast before any network round-trips.
pubKeyPEM := updater.BangerReleasePublicKey
if strings.TrimSpace(opts.pubkeyFile) != "" {
body, err := os.ReadFile(opts.pubkeyFile)
if err != nil {
return fmt.Errorf("read --pubkey-file: %w", err)
}
pubKeyPEM = string(body)
}
// Discover.
client := &http.Client{Timeout: 30 * time.Second}
manifest, err := updater.FetchManifest(ctx, client)
var (
manifest updater.Manifest
err error
)
if strings.TrimSpace(opts.manifestURL) != "" {
manifest, err = updater.FetchManifestFrom(ctx, client, opts.manifestURL)
} else {
manifest, err = updater.FetchManifest(ctx, client)
}
if err != nil {
return fmt.Errorf("discover: %w", err)
}
@ -142,7 +176,7 @@ func (d *deps) runUpdate(cmd *cobra.Command, opts runUpdateOpts) error {
if err != nil {
return fmt.Errorf("download: %w", err)
}
if err := updater.FetchAndVerifySignature(ctx, client, target, sumsBody); err != nil {
if err := updater.FetchAndVerifySignatureWithKey(ctx, client, target, sumsBody, pubKeyPEM); err != nil {
// Don't leave the staged tarball around — it failed
// signature verification and shouldn't be re-runnable.
_ = os.Remove(tarballPath)
@ -179,15 +213,21 @@ func (d *deps) runUpdate(cmd *cobra.Command, opts runUpdateOpts) error {
return fmt.Errorf("swap: %w (rolled back)", err)
}
// Restart services + wait for the new daemon.
// Restart services + wait for the new daemon. A `systemctl restart`
// that fails has typically already STOPPED the unit, so the prior
// binary on disk isn't running anywhere — Rollback() must be paired
// with a re-restart to bring the rolled-back binary back into a
// running state. That's rollbackAndRestart's job; rollbackAndWrap
// is for the swap-step failures earlier where the restart never
// fired and the old binary is still in memory.
if err := d.runSystemctl(ctx, "restart", installmeta.DefaultRootHelperService); err != nil {
return rollbackAndWrap(swap, "restart helper", err)
return rollbackAndRestart(ctx, d, swap, "restart helper", err)
}
if err := d.runSystemctl(ctx, "restart", installmeta.DefaultService); err != nil {
return rollbackAndWrap(swap, "restart daemon", err)
return rollbackAndRestart(ctx, d, swap, "restart daemon", err)
}
if err := d.waitForDaemonReady(ctx, socketPath); err != nil {
return rollbackAndWrap(swap, "wait daemon ready", err)
return rollbackAndRestart(ctx, d, swap, "wait daemon ready", err)
}
// Verify with doctor unless --force says otherwise.

View file

@ -178,7 +178,15 @@ func TestRenderRootHelperSystemdUnitIncludesRequiredCapabilities(t *testing.T) {
for _, want := range []string{
"ExecStart=/usr/local/bin/bangerd --root-helper",
// Both directives are load-bearing for "VM survives helper
// restart": KillMode=process limits the initial SIGTERM to
// the helper main, SendSIGKILL=no disables the SIGKILL
// escalation. The helper itself does the cgroup reparent
// (see roothelper.reparentToBangerFCCgroup) — without
// that, even these directives leave firecracker exposed to
// systemd's stop-time cleanup.
"KillMode=process",
"SendSIGKILL=no",
"Environment=TMPDIR=/run/banger-root",
"NoNewPrivileges=yes",
"PrivateTmp=yes",

View file

@ -196,6 +196,15 @@ func buildConfig(cfg MachineConfig) sdk.Config {
Smt: sdk.Bool(false),
},
VMID: cfg.VMID,
// Disable the SDK's signal-forwarding goroutine. Default
// (nil) makes the SDK install a handler that catches
// SIGTERM/SIGINT/SIGHUP/SIGQUIT/SIGABRT in the parent process
// and forwards them to the firecracker child — which means
// `systemctl stop bangerd-root.service` (sends SIGTERM to the
// helper) ends up signaling every firecracker the helper has
// launched, killing every running VM. Empty slice (not nil)
// short-circuits setupSignals at len()==0.
ForwardSignals: []os.Signal{},
}
if cfg.Jailer != nil {
// The path fields above are already chroot-translated by the
@ -267,6 +276,7 @@ func defaultDriveID(drive DriveConfig, fallback string) string {
// the configured UID:GID) — see fcproc.PrepareJailerChroot. The SDK's own
// JailerCfg path is intentionally bypassed: it cannot mknod block devices and
// does not expose --new-pid-ns.
//
func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd {
var bin string
var args []string
@ -277,9 +287,10 @@ func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd {
args = []string{"--api-sock", cfg.SocketPath, "--id", cfg.VMID}
}
var cmd *exec.Cmd
if os.Geteuid() == 0 {
switch {
case os.Geteuid() == 0:
cmd = exec.Command(bin, args...)
} else {
default:
cmd = exec.Command("sudo", append([]string{"-n", "-E", bin}, args...)...)
}
cmd.Stdin = nil

View file

@ -75,15 +75,23 @@ type Release struct {
// Release.
const ManifestSchemaVersion = 1
// FetchManifest downloads the release manifest and validates its
// shape. Returns an error if the server is unreachable, returns
// non-2xx, exceeds the size cap, or the schema_version is newer
// than this CLI knows.
// FetchManifest downloads the release manifest from the embedded
// canonical URL and validates its shape. Returns an error if the
// server is unreachable, returns non-2xx, exceeds the size cap, or
// the schema_version is newer than this CLI knows.
func FetchManifest(ctx context.Context, client *http.Client) (Manifest, error) {
return FetchManifestFrom(ctx, client, manifestURL)
}
// FetchManifestFrom is FetchManifest against an explicit URL. Used by
// the smoke suite (via `banger update --manifest-url …`) to drive the
// updater against a locally-served fake manifest. Production callers
// stick with FetchManifest.
func FetchManifestFrom(ctx context.Context, client *http.Client, url string) (Manifest, error) {
if client == nil {
client = http.DefaultClient
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, manifestURL, nil)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return Manifest{}, err
}

View file

@ -61,18 +61,26 @@ var ErrSignatureRequired = errors.New("banger release public key is the placehol
// VerifyBlobSignature checks that sigBase64 is a valid cosign-blob
// signature over body, made with the private counterpart of
// BangerReleasePublicKey. cosign's blob signature format is a
// base64-encoded ASN.1-DER ECDSA signature over SHA256(body) — that's
// what the package's ecdsa.VerifyASN1 verifies natively.
//
// Refuses outright if the embedded public key is still the build-
// time placeholder, so an unset key can't slip through as
// "verification disabled."
// BangerReleasePublicKey.
func VerifyBlobSignature(body, sigBase64 []byte) error {
if isPlaceholderKey(BangerReleasePublicKey) {
return VerifyBlobSignatureWithKey(body, sigBase64, BangerReleasePublicKey)
}
// VerifyBlobSignatureWithKey is VerifyBlobSignature against an
// explicit PEM-encoded public key. Used by the smoke suite (via
// `banger update --pubkey-file …`) so an end-to-end update test can
// trust a locally-generated keypair without rebuilding the binary.
//
// Refuses outright if pubKeyPEM is the build-time placeholder so an
// unset key can't slip through as "verification disabled".
//
// cosign's blob signature format is a base64-encoded ASN.1-DER ECDSA
// signature over SHA256(body) — that's what ecdsa.VerifyASN1 takes.
func VerifyBlobSignatureWithKey(body, sigBase64 []byte, pubKeyPEM string) error {
if isPlaceholderKey(pubKeyPEM) {
return ErrSignatureRequired
}
block, _ := pem.Decode([]byte(BangerReleasePublicKey))
block, _ := pem.Decode([]byte(pubKeyPEM))
if block == nil {
return fmt.Errorf("decode banger release public key: no PEM block")
}
@ -96,15 +104,21 @@ func VerifyBlobSignature(body, sigBase64 []byte) error {
}
// FetchAndVerifySignature pulls the SHA256SUMS.sig URL from the
// release, downloads it (capped), and verifies it against
// sumsBody. Returns nil on a clean pass, or an error describing
// exactly why verification failed.
// release, downloads it (capped), and verifies it against sumsBody.
// Returns nil on a clean pass, or an error describing exactly why
// verification failed.
//
// If release.SHA256SumsSigURL is empty, treat that as "release was
// not signed" — refuse rather than silently proceeding. v0.1.0
// requires every release to be cosign-signed; an unsigned release
// is a manifest publishing bug we'd rather catch loudly.
func FetchAndVerifySignature(ctx context.Context, client *http.Client, release Release, sumsBody []byte) error {
return FetchAndVerifySignatureWithKey(ctx, client, release, sumsBody, BangerReleasePublicKey)
}
// FetchAndVerifySignatureWithKey is FetchAndVerifySignature against
// an explicit PEM-encoded public key.
func FetchAndVerifySignatureWithKey(ctx context.Context, client *http.Client, release Release, sumsBody []byte, pubKeyPEM string) error {
if strings.TrimSpace(release.SHA256SumsSigURL) == "" {
return fmt.Errorf("release %s has no sha256sums_sig_url; refusing to install an unsigned release", release.Version)
}
@ -115,7 +129,7 @@ func FetchAndVerifySignature(ctx context.Context, client *http.Client, release R
if err != nil {
return fmt.Errorf("fetch signature: %w", err)
}
if err := VerifyBlobSignature(sumsBody, sig); err != nil {
if err := VerifyBlobSignatureWithKey(sumsBody, sig, pubKeyPEM); err != nil {
return fmt.Errorf("verify SHA256SUMS signature: %w", err)
}
return nil

View file

@ -0,0 +1,54 @@
package updater
import (
	"os"
	"os/exec"
	"path/filepath"
	"testing"
)
// TestVerifyBlobSignatureWithOpenSSL is a confidence test for the
// smoke release-builder path. It generates a throwaway P-256 keypair
// with openssl, signs a small blob with `openssl dgst -sha256 -sign`,
// base64-encodes the signature, and checks that
// VerifyBlobSignatureWithKey accepts the result against the matching
// public key — i.e. that openssl's output is in the same encoding the
// verifier expects for cosign blob signatures (base64 ASN.1 ECDSA
// over SHA256(body)).
//
// Pinning the equivalence here means a drift between the two shows up
// as a direct unit-test failure rather than as a hard-to-diagnose
// failure in scripts/smoke.sh. The test is skipped when openssl is
// not on PATH.
func TestVerifyBlobSignatureWithOpenSSL(t *testing.T) {
	if _, err := exec.LookPath("openssl"); err != nil {
		t.Skip("openssl not on PATH")
	}

	dir := t.TempDir()
	keyPath := filepath.Join(dir, "cosign.key")
	pubPath := filepath.Join(dir, "cosign.pub")
	bodyPath := filepath.Join(dir, "body.txt")
	sigPath := filepath.Join(dir, "body.sig")

	// mustRun executes an external command and aborts the test with
	// the command's combined output if it exits non-zero.
	mustRun := func(name string, args ...string) {
		t.Helper()
		out, err := exec.Command(name, args...).CombinedOutput()
		if err != nil {
			t.Fatalf("%s %v: %v\n%s", name, args, err, string(out))
		}
	}

	mustRun("openssl", "ecparam", "-name", "prime256v1", "-genkey", "-noout", "-out", keyPath)
	mustRun("openssl", "ec", "-in", keyPath, "-pubout", "-out", pubPath)

	// Paths are handed to the shell as positional parameters ("$1",
	// "$2", ...) instead of being concatenated into the script text,
	// so a TMPDIR containing spaces or shell metacharacters cannot
	// change what the script does. The extra "sh" argument fills $0.
	mustRun("sh", "-c", `printf 'banger smoke release sums\n' > "$1"`, "sh", bodyPath)
	mustRun("sh", "-c", `openssl dgst -sha256 -sign "$1" "$2" | base64 -w0 > "$3"`, "sh", keyPath, bodyPath, sigPath)

	body := readFile(t, bodyPath)
	sig := readFile(t, sigPath)
	pub := readFile(t, pubPath)

	if err := VerifyBlobSignatureWithKey(body, sig, string(pub)); err != nil {
		t.Fatalf("VerifyBlobSignatureWithKey: %v", err)
	}
}
// readFile returns the full contents of the file at p. Any read error
// fails the calling test immediately via t.Fatalf, so callers can use
// the result without an error check.
func readFile(t *testing.T, p string) []byte {
	t.Helper()
	// Read in-process rather than forking `cat`: no dependency on an
	// external binary being on PATH, and failures carry the real I/O
	// error instead of an exit status.
	data, err := os.ReadFile(p)
	if err != nil {
		t.Fatalf("read %s: %v", p, err)
	}
	return data
}