update: VMs survive banger update and rollback

Three load-bearing fixes that together let `banger update` (and its
auto-rollback path) restart the helper + daemon without killing
every running VM. New smoke scenarios prove the property end-to-end.

Bug fixes:

1. Disable the firecracker SDK's signal-forwarding goroutine. The
   default ForwardSignals = [SIGINT, SIGQUIT, SIGTERM, SIGHUP,
   SIGABRT] installs a handler in the helper that propagates the
   helper's SIGTERM (sent by systemd on
   `systemctl stop bangerd-root.service`) to every running
   firecracker child. Set ForwardSignals to an empty (non-nil) slice
   so setupSignals short-circuits at len()==0.

2. Add SendSIGKILL=no to bangerd-root.service. KillMode=process
   limits the initial SIGTERM to the helper main, but systemd
   still SIGKILLs leftover cgroup processes during the
   FinalKillSignal stage unless SendSIGKILL=no.

3. Route restart-helper / restart-daemon / wait-daemon-ready
   failures through rollbackAndRestart instead of rollbackAndWrap.
   rollbackAndWrap restored .previous binaries but didn't re-
   restart the failed unit, leaving the helper dead with the
   rolled-back binary on disk after a failed update.

Testing infrastructure (production binaries unaffected):

- Hidden --manifest-url and --pubkey-file flags on `banger update`
  let the smoke harness redirect the updater at locally-built
  release artefacts. Marked Hidden in cobra; not advertised in
  --help.
- FetchManifestFrom / VerifyBlobSignatureWithKey /
  FetchAndVerifySignatureWithKey export the existing logic against
  caller-supplied URL / pubkey. The default entry points still
  call them with the embedded canonical values.

Smoke scenarios:

- update_check: --check against fake manifest reports update
  available
- update_to_unknown: --to v9.9.9 fails before any host mutation
- update_no_root: refuses without sudo, install untouched
- update_dry_run: stages + verifies, no swap, version unchanged
- update_keeps_vm_alive: real swap to v0.smoke.0; same VM (same
  boot_id) answers SSH after the daemon restart
- update_rollback_keeps_vm_alive: v0.smoke.broken-bangerd ships a
  bangerd that passes --check-migrations but exits 1 as the
  daemon. The post-swap `systemctl restart bangerd` fails,
  rollbackAndRestart fires, the .previous binaries are restored
  and re-restarted; the same VM still answers SSH afterwards
- daemon_admin (separate prep): covers `banger daemon socket`,
  `bangerd --check-migrations --system`, `sudo banger daemon
  stop`

The smoke release builder generates a fresh ECDSA P-256 keypair
with openssl, signs SHA256SUMS cosign-compatibly, and serves
artefacts from a backgrounded python http.server.
verify_smoke_check_test.go pins the openssl/cosign signature
equivalence so the smoke release builder can't silently drift.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Thales Maciel 2026-05-01 12:08:08 -03:00
parent 7e528f30b3
commit 2606bfbabb
No known key found for this signature in database
GPG key ID: 33112E6833C34679
8 changed files with 609 additions and 50 deletions

View file

@ -364,18 +364,34 @@ func renderRootHelperSystemdUnit() string {
"ExecStart=" + systemBangerdBin + " --root-helper",
"Restart=on-failure",
"RestartSec=1s",
// KillMode=process is load-bearing: the helper unit's cgroup is
// where every banger-launched firecracker process lives (see
// validateFirecrackerPID). Without this, `systemctl restart
// bangerd-root.service` — which `banger update` runs — would
// SIGKILL every in-flight VM along with the helper because
// systemd's default KillMode=control-group nukes the whole cgroup.
// With process mode, only the helper PID is signaled; firecracker
// children survive, the new helper instance re-attaches via the
// helper RPC, daemon reconcile re-seeds in-memory state, VM keeps
// running. `banger system uninstall` and the daemon's vm-stop
// path explicitly stop firecracker processes when actually needed.
// KillMode=process + SendSIGKILL=no together make the helper
// safe to restart while banger-launched firecrackers are
// running. firecracker lives in this unit's cgroup (jailer
// doesn't open a sub-cgroup), so:
//
// - Default control-group mode SIGKILLs every process in
// the cgroup on stop.
// - KillMode=process limits the initial SIGTERM to the
// helper main PID; systemd leaves remaining cgroup
// processes alone (and logs "Unit process N (firecracker)
// remains running after unit stopped").
// - SendSIGKILL=no disables the FinalKillSignal escalation
// that would otherwise SIGKILL leftovers after the timeout.
//
// One more pitfall: the firecracker SDK installs a default
// signal-forwarding goroutine in the helper that catches
// SIGTERM (etc.) and forwards it to every firecracker child.
// We disable that explicitly via ForwardSignals: []os.Signal{}
// in firecracker.buildConfig — without that override, systemd
// signaling the helper main would propagate to every running
// VM regardless of what these directives do.
//
// `banger system uninstall` and the daemon's vm-stop path
// explicitly stop firecracker processes when actually needed,
// so we don't lose the systemd-driven kill as a real safety
// net — banger drives those kills itself.
"KillMode=process",
"SendSIGKILL=no",
"Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"Environment=TMPDIR=" + installmeta.DefaultRootHelperRuntimeDir,
"UMask=0077",

View file

@ -30,10 +30,12 @@ const stagingTarballName = "release.tar.gz"
func (d *deps) newUpdateCommand() *cobra.Command {
var (
checkOnly bool
dryRun bool
force bool
toVersion string
checkOnly bool
dryRun bool
force bool
toVersion string
manifestURL string
pubkeyFile string
)
cmd := &cobra.Command{
Use: "update",
@ -68,10 +70,12 @@ talks to systemd. Run with sudo.
Args: noArgsUsage("usage: banger update [--check] [--dry-run] [--force] [--to vX.Y.Z]"),
RunE: func(cmd *cobra.Command, args []string) error {
return d.runUpdate(cmd, runUpdateOpts{
checkOnly: checkOnly,
dryRun: dryRun,
force: force,
toVersion: toVersion,
checkOnly: checkOnly,
dryRun: dryRun,
force: force,
toVersion: toVersion,
manifestURL: manifestURL,
pubkeyFile: pubkeyFile,
})
},
}
@ -79,23 +83,53 @@ talks to systemd. Run with sudo.
cmd.Flags().BoolVar(&dryRun, "dry-run", false, "fetch and verify, but do not swap or restart anything")
cmd.Flags().BoolVar(&force, "force", false, "skip in-flight-op refusal and post-restart doctor verification")
cmd.Flags().StringVar(&toVersion, "to", "", "specific release version to install (default: latest_stable from manifest)")
// Hidden test/dev hooks: redirect the updater at a non-default
// manifest URL and trust a non-default cosign public key. Used by
// the smoke suite to drive a real update against locally-built
// release artefacts. Production users have no reason to touch
// these; they are not advertised in --help.
cmd.Flags().StringVar(&manifestURL, "manifest-url", "", "")
cmd.Flags().StringVar(&pubkeyFile, "pubkey-file", "", "")
_ = cmd.Flags().MarkHidden("manifest-url")
_ = cmd.Flags().MarkHidden("pubkey-file")
return cmd
}
type runUpdateOpts struct {
checkOnly bool
dryRun bool
force bool
toVersion string
checkOnly bool
dryRun bool
force bool
toVersion string
manifestURL string
pubkeyFile string
}
func (d *deps) runUpdate(cmd *cobra.Command, opts runUpdateOpts) error {
ctx := cmd.Context()
out := cmd.OutOrStdout()
// Resolve the test/dev override flags up front so a bad
// --pubkey-file fails fast before any network round-trips.
pubKeyPEM := updater.BangerReleasePublicKey
if strings.TrimSpace(opts.pubkeyFile) != "" {
body, err := os.ReadFile(opts.pubkeyFile)
if err != nil {
return fmt.Errorf("read --pubkey-file: %w", err)
}
pubKeyPEM = string(body)
}
// Discover.
client := &http.Client{Timeout: 30 * time.Second}
manifest, err := updater.FetchManifest(ctx, client)
var (
manifest updater.Manifest
err error
)
if strings.TrimSpace(opts.manifestURL) != "" {
manifest, err = updater.FetchManifestFrom(ctx, client, opts.manifestURL)
} else {
manifest, err = updater.FetchManifest(ctx, client)
}
if err != nil {
return fmt.Errorf("discover: %w", err)
}
@ -142,7 +176,7 @@ func (d *deps) runUpdate(cmd *cobra.Command, opts runUpdateOpts) error {
if err != nil {
return fmt.Errorf("download: %w", err)
}
if err := updater.FetchAndVerifySignature(ctx, client, target, sumsBody); err != nil {
if err := updater.FetchAndVerifySignatureWithKey(ctx, client, target, sumsBody, pubKeyPEM); err != nil {
// Don't leave the staged tarball around — it failed
// signature verification and shouldn't be re-runnable.
_ = os.Remove(tarballPath)
@ -179,15 +213,21 @@ func (d *deps) runUpdate(cmd *cobra.Command, opts runUpdateOpts) error {
return fmt.Errorf("swap: %w (rolled back)", err)
}
// Restart services + wait for the new daemon.
// Restart services + wait for the new daemon. A `systemctl restart`
// that fails has typically already STOPPED the unit, so the prior
// binary on disk isn't running anywhere — Rollback() must be paired
// with a re-restart to bring the rolled-back binary back into a
// running state. That's rollbackAndRestart's job; rollbackAndWrap
// is for the swap-step failures earlier where the restart never
// fired and the old binary is still in memory.
if err := d.runSystemctl(ctx, "restart", installmeta.DefaultRootHelperService); err != nil {
return rollbackAndWrap(swap, "restart helper", err)
return rollbackAndRestart(ctx, d, swap, "restart helper", err)
}
if err := d.runSystemctl(ctx, "restart", installmeta.DefaultService); err != nil {
return rollbackAndWrap(swap, "restart daemon", err)
return rollbackAndRestart(ctx, d, swap, "restart daemon", err)
}
if err := d.waitForDaemonReady(ctx, socketPath); err != nil {
return rollbackAndWrap(swap, "wait daemon ready", err)
return rollbackAndRestart(ctx, d, swap, "wait daemon ready", err)
}
// Verify with doctor unless --force says otherwise.

View file

@ -178,7 +178,15 @@ func TestRenderRootHelperSystemdUnitIncludesRequiredCapabilities(t *testing.T) {
for _, want := range []string{
"ExecStart=/usr/local/bin/bangerd --root-helper",
// Both directives are load-bearing for "VM survives helper
// restart": KillMode=process limits the initial SIGTERM to
// the helper main, SendSIGKILL=no disables the SIGKILL
// escalation. The helper itself does the cgroup reparent
// (see roothelper.reparentToBangerFCCgroup) — without
// that, even these directives leave firecracker exposed to
// systemd's stop-time cleanup.
"KillMode=process",
"SendSIGKILL=no",
"Environment=TMPDIR=/run/banger-root",
"NoNewPrivileges=yes",
"PrivateTmp=yes",

View file

@ -196,6 +196,15 @@ func buildConfig(cfg MachineConfig) sdk.Config {
Smt: sdk.Bool(false),
},
VMID: cfg.VMID,
// Disable the SDK's signal-forwarding goroutine. Default
// (nil) makes the SDK install a handler that catches
// SIGTERM/SIGINT/SIGHUP/SIGQUIT/SIGABRT in the parent process
// and forwards them to the firecracker child — which means
// `systemctl stop bangerd-root.service` (sends SIGTERM to the
// helper) ends up signaling every firecracker the helper has
// launched, killing every running VM. Empty slice (not nil)
// short-circuits setupSignals at len()==0.
ForwardSignals: []os.Signal{},
}
if cfg.Jailer != nil {
// The path fields above are already chroot-translated by the
@ -267,6 +276,7 @@ func defaultDriveID(drive DriveConfig, fallback string) string {
// the configured UID:GID) — see fcproc.PrepareJailerChroot. The SDK's own
// JailerCfg path is intentionally bypassed: it cannot mknod block devices and
// does not expose --new-pid-ns.
//
func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd {
var bin string
var args []string
@ -277,9 +287,10 @@ func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd {
args = []string{"--api-sock", cfg.SocketPath, "--id", cfg.VMID}
}
var cmd *exec.Cmd
if os.Geteuid() == 0 {
switch {
case os.Geteuid() == 0:
cmd = exec.Command(bin, args...)
} else {
default:
cmd = exec.Command("sudo", append([]string{"-n", "-E", bin}, args...)...)
}
cmd.Stdin = nil

View file

@ -75,15 +75,23 @@ type Release struct {
// Release.
const ManifestSchemaVersion = 1
// FetchManifest downloads the release manifest and validates its
// shape. Returns an error if the server is unreachable, returns
// non-2xx, exceeds the size cap, or the schema_version is newer
// than this CLI knows.
// FetchManifest downloads the release manifest from the embedded
// canonical URL and validates its shape. Returns an error if the
// server is unreachable, returns non-2xx, exceeds the size cap, or
// the schema_version is newer than this CLI knows.
func FetchManifest(ctx context.Context, client *http.Client) (Manifest, error) {
// Pure delegate: the request and validation logic lives in
// FetchManifestFrom; this entry point only supplies manifestURL.
return FetchManifestFrom(ctx, client, manifestURL)
}
// FetchManifestFrom is FetchManifest against an explicit URL. Used by
// the smoke suite (via `banger update --manifest-url …`) to drive the
// updater against a locally-served fake manifest. Production callers
// stick with FetchManifest.
func FetchManifestFrom(ctx context.Context, client *http.Client, url string) (Manifest, error) {
if client == nil {
client = http.DefaultClient
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, manifestURL, nil)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return Manifest{}, err
}

View file

@ -61,18 +61,26 @@ var ErrSignatureRequired = errors.New("banger release public key is the placehol
// VerifyBlobSignature checks that sigBase64 is a valid cosign-blob
// signature over body, made with the private counterpart of
// BangerReleasePublicKey. cosign's blob signature format is a
// base64-encoded ASN.1-DER ECDSA signature over SHA256(body) — that's
// what the package's ecdsa.VerifyASN1 verifies natively.
//
// Refuses outright if the embedded public key is still the build-
// time placeholder, so an unset key can't slip through as
// "verification disabled."
// BangerReleasePublicKey.
func VerifyBlobSignature(body, sigBase64 []byte) error {
if isPlaceholderKey(BangerReleasePublicKey) {
return VerifyBlobSignatureWithKey(body, sigBase64, BangerReleasePublicKey)
}
// VerifyBlobSignatureWithKey is VerifyBlobSignature against an
// explicit PEM-encoded public key. Used by the smoke suite (via
// `banger update --pubkey-file …`) so an end-to-end update test can
// trust a locally-generated keypair without rebuilding the binary.
//
// Refuses outright if pubKeyPEM is the build-time placeholder so an
// unset key can't slip through as "verification disabled".
//
// cosign's blob signature format is a base64-encoded ASN.1-DER ECDSA
// signature over SHA256(body) — that's what ecdsa.VerifyASN1 takes.
func VerifyBlobSignatureWithKey(body, sigBase64 []byte, pubKeyPEM string) error {
if isPlaceholderKey(pubKeyPEM) {
return ErrSignatureRequired
}
block, _ := pem.Decode([]byte(BangerReleasePublicKey))
block, _ := pem.Decode([]byte(pubKeyPEM))
if block == nil {
return fmt.Errorf("decode banger release public key: no PEM block")
}
@ -96,15 +104,21 @@ func VerifyBlobSignature(body, sigBase64 []byte) error {
}
// FetchAndVerifySignature pulls the SHA256SUMS.sig URL from the
// release, downloads it (capped), and verifies it against
// sumsBody. Returns nil on a clean pass, or an error describing
// exactly why verification failed.
// release, downloads it (capped), and verifies it against sumsBody.
// Returns nil on a clean pass, or an error describing exactly why
// verification failed.
//
// If release.SHA256SumsSigURL is empty, treat that as "release was
// not signed" — refuse rather than silently proceeding. v0.1.0
// requires every release to be cosign-signed; an unsigned release
// is a manifest publishing bug we'd rather catch loudly.
func FetchAndVerifySignature(ctx context.Context, client *http.Client, release Release, sumsBody []byte) error {
// Pure delegate: FetchAndVerifySignatureWithKey does the work; this
// entry point only supplies BangerReleasePublicKey as the trusted key.
return FetchAndVerifySignatureWithKey(ctx, client, release, sumsBody, BangerReleasePublicKey)
}
// FetchAndVerifySignatureWithKey is FetchAndVerifySignature against
// an explicit PEM-encoded public key.
func FetchAndVerifySignatureWithKey(ctx context.Context, client *http.Client, release Release, sumsBody []byte, pubKeyPEM string) error {
if strings.TrimSpace(release.SHA256SumsSigURL) == "" {
return fmt.Errorf("release %s has no sha256sums_sig_url; refusing to install an unsigned release", release.Version)
}
@ -115,7 +129,7 @@ func FetchAndVerifySignature(ctx context.Context, client *http.Client, release R
if err != nil {
return fmt.Errorf("fetch signature: %w", err)
}
if err := VerifyBlobSignature(sumsBody, sig); err != nil {
if err := VerifyBlobSignatureWithKey(sumsBody, sig, pubKeyPEM); err != nil {
return fmt.Errorf("verify SHA256SUMS signature: %w", err)
}
return nil

View file

@ -0,0 +1,54 @@
package updater
import (
	"encoding/base64"
	"os"
	"os/exec"
	"path/filepath"
	"testing"
)
// TestVerifyBlobSignatureWithOpenSSL pins the signature encoding the
// smoke release builder relies on: a P-256 key generated by openssl,
// signed with `openssl dgst -sha256 -sign`, then standard-base64
// encoded, must be accepted by VerifyBlobSignatureWithKey.
//
// Only openssl is invoked as an external tool. File I/O and base64
// encoding happen in-process, so the test needs neither a shell nor
// GNU coreutils. Skipped when openssl is not on PATH.
func TestVerifyBlobSignatureWithOpenSSL(t *testing.T) {
	if _, err := exec.LookPath("openssl"); err != nil {
		t.Skip("openssl not on PATH")
	}
	dir := t.TempDir()
	keyPath := filepath.Join(dir, "cosign.key")
	pubPath := filepath.Join(dir, "cosign.pub")
	bodyPath := filepath.Join(dir, "body.txt")
	rawSigPath := filepath.Join(dir, "body.sig.der")

	// mustRun executes a command with an explicit argv (no shell), and
	// fails the test with the combined output on a non-zero exit.
	mustRun := func(name string, args ...string) {
		t.Helper()
		out, err := exec.Command(name, args...).CombinedOutput()
		if err != nil {
			t.Fatalf("%s %v: %v\n%s", name, args, err, string(out))
		}
	}

	mustRun("openssl", "ecparam", "-name", "prime256v1", "-genkey", "-noout", "-out", keyPath)
	mustRun("openssl", "ec", "-in", keyPath, "-pubout", "-out", pubPath)

	body := []byte("banger smoke release sums\n")
	if err := os.WriteFile(bodyPath, body, 0o600); err != nil {
		t.Fatalf("write %s: %v", bodyPath, err)
	}
	// -out writes the raw ASN.1-DER signature to a file, so no shell
	// pipeline is needed to capture it.
	mustRun("openssl", "dgst", "-sha256", "-sign", keyPath, "-out", rawSigPath, bodyPath)

	// Single-line standard base64 with padding, i.e. what `base64 -w0`
	// emits for the same bytes.
	sig := []byte(base64.StdEncoding.EncodeToString(readFile(t, rawSigPath)))
	pub := readFile(t, pubPath)

	if err := VerifyBlobSignatureWithKey(body, sig, string(pub)); err != nil {
		t.Fatalf("VerifyBlobSignatureWithKey: %v", err)
	}
}

// readFile returns the contents of p, failing the test immediately if
// the file cannot be read.
func readFile(t *testing.T, p string) []byte {
	t.Helper()
	out, err := os.ReadFile(p)
	if err != nil {
		t.Fatalf("read %s: %v", p, err)
	}
	return out
}

View file

@ -80,6 +80,13 @@ SMOKE_SCENARIOS=(
nat
invalid_spec
invalid_name
update_check
update_to_unknown
update_no_root
update_dry_run
update_keeps_vm_alive
update_rollback_keeps_vm_alive
daemon_admin
)
declare -A SMOKE_DESCS=(
@ -104,6 +111,13 @@ declare -A SMOKE_DESCS=(
[nat]="--nat installs per-VM MASQUERADE; control VM does not"
[invalid_spec]="--vcpu 0 rejected, no VM row leaked"
[invalid_name]="bad names (uppercase/space/dot/leading-hyphen) all rejected"
[update_check]="update --check reports update-available against fake manifest"
[update_to_unknown]="update --to v9.9.9 fails before any host mutation"
[update_no_root]="update without sudo refuses with a root-required error"
[update_dry_run]="update --dry-run fetches + verifies but does not swap"
[update_keeps_vm_alive]="update v0.smoke.0: VM SSH survives the daemon restart, install.toml + version flip"
[update_rollback_keeps_vm_alive]="rollback drill: broken-bangerd release fails to start, Rollback fires, binary reverts, VM SSH survives"
[daemon_admin]="daemon socket prints sock path; --check-migrations reports compatible; daemon stop tears services down"
)
declare -A SMOKE_CLASS=(
@ -128,6 +142,13 @@ declare -A SMOKE_CLASS=(
[nat]=global
[invalid_spec]=global
[invalid_name]=global
[update_check]=global
[update_to_unknown]=global
[update_no_root]=global
[update_dry_run]=global
[update_keeps_vm_alive]=global
[update_rollback_keeps_vm_alive]=global
[daemon_admin]=global
)
usage() {
@ -306,15 +327,24 @@ sudo_banger() {
sudo env GOCOVERDIR="$BANGER_SMOKE_COVER_DIR" "$@"
}
# Stops the backgrounded release HTTP server if RELEASE_HTTP_PID names a
# live process, then reaps it. Best-effort: always returns 0.
cleanup_release_server() {
  local server_pid="${RELEASE_HTTP_PID:-}"

  # Nothing recorded, or the process is already gone: nothing to do.
  [[ -n "$server_pid" ]] || return 0
  kill -0 "$server_pid" 2>/dev/null || return 0

  kill "$server_pid" 2>/dev/null || true
  wait "$server_pid" 2>/dev/null || true
}
cleanup() {
set +e
for vm in \
smoke-lifecycle smoke-set smoke-restart smoke-kill smoke-ports smoke-fc \
smoke-basecommit smoke-exec smoke-wsrestart smoke-nat smoke-nocnat; do
smoke-basecommit smoke-exec smoke-wsrestart smoke-nat smoke-nocnat \
smoke-update smoke-rollback; do
"$BANGER" vm delete "$vm" >/dev/null 2>&1 || true
done
cleanup_export_vm
cleanup_prune
cleanup_release_server
stop_services_for_coverage
collect_service_coverage
sudo_banger "$BANGER" system uninstall --purge >/dev/null 2>&1 || true
@ -885,6 +915,384 @@ scenario_invalid_name() {
|| die "invalid name leaked VM row(s): pre=$pre_vms, post=$post_vms"
}
# ---------------------------------------------------------------------
# Update flow: locally-built release artefacts + a backgrounded HTTP
# server stand in for the real Cloudflare R2 bucket. The hidden
# --manifest-url and --pubkey-file flags on `banger update` redirect
# the updater at this fake bucket. Production binaries reject anything
# that isn't signed by the embedded cosign key, so smoke generates a
# fresh ECDSA keypair and points the updater at the matching pub key.
# ---------------------------------------------------------------------
# Shared state for the update scenarios. prepare_smoke_releases fills
# these in on its first call and returns immediately on later calls, so
# every update scenario can invoke it unconditionally: the calls are
# cheap and idempotent after the first one (the full suite runs the
# scenarios in sequence; --scenario filtering may skip ahead).
#
# SMOKE_RELEASES_READY: 1 once prepare_smoke_releases has completed.
SMOKE_RELEASES_READY=0
# PID of the backgrounded python3 HTTP server (killed by
# cleanup_release_server).
RELEASE_HTTP_PID=
# TCP port that server announced.
RELEASE_PORT=
# URL of manifest.json on that server; passed as --manifest-url.
MANIFEST_URL=
# Path of the generated cosign.pub; passed as --pubkey-file.
PUBKEY_FILE=
# Builds the fake release bucket used by the update scenarios, once.
#
# First call: generates an ECDSA P-256 keypair with openssl, builds and
# signs the v0.smoke.0 and v0.smoke.broken-bangerd releases, starts a
# python3 HTTP server on 127.0.0.1 (kernel-chosen port) serving the
# release directory, and writes manifest.json pointing at that server.
# Later calls return immediately.
#
# Globals read:    scratch_root, SMOKE_RELEASES_READY
# Globals written: PUBKEY_FILE, RELEASE_HTTP_PID, RELEASE_PORT,
#                  MANIFEST_URL, SMOKE_RELEASES_READY
# Exits via die if a required tool is missing or any step fails.
prepare_smoke_releases() {
  if (( SMOKE_RELEASES_READY == 1 )); then return 0; fi

  # Abort on an empty/unset scratch_root instead of letting the rm -rf
  # below operate on "/release".
  : "${scratch_root:?scratch_root must be set before preparing smoke releases}"
  local rel_dir="$scratch_root/release"
  { rm -rf -- "$rel_dir" && mkdir -p -- "$rel_dir"; } \
    || die "could not reset release dir $rel_dir"

  # Generate ECDSA P-256 keypair (cosign blob signatures are an ASN.1
  # ECDSA signature over SHA256(body); openssl produces the same
  # encoding via `openssl dgst -sha256 -sign`).
  command -v openssl >/dev/null 2>&1 || die 'update scenarios need openssl'
  command -v python3 >/dev/null 2>&1 || die 'update scenarios need python3'
  openssl ecparam -name prime256v1 -genkey -noout -out "$rel_dir/cosign.key" 2>/dev/null \
    || die 'openssl: keypair generation failed'
  openssl ec -in "$rel_dir/cosign.key" -pubout -out "$rel_dir/cosign.pub" 2>/dev/null \
    || die 'openssl: public key extraction failed'
  PUBKEY_FILE="$rel_dir/cosign.pub"

  build_smoke_release_tarball "$rel_dir" v0.smoke.0
  build_smoke_release_tarball "$rel_dir" v0.smoke.broken-bangerd

  # Background a tiny HTTP server. Port 0 lets the kernel pick a free
  # port; the python harness prints the chosen port on stdout so we
  # can compose the manifest URLs once we know it.
  local port_file="$rel_dir/.port"
  : >"$port_file"
  python3 -u -c "
import http.server, socketserver, sys, os
os.chdir(sys.argv[1])
class H(http.server.SimpleHTTPRequestHandler):
    def log_message(self, *a, **kw): pass
with socketserver.TCPServer(('127.0.0.1', 0), H) as srv:
    sys.stdout.write(str(srv.server_address[1]) + '\n'); sys.stdout.flush()
    srv.serve_forever()
" "$rel_dir" >"$port_file" 2>/dev/null &
  RELEASE_HTTP_PID=$!

  # Poll for the port announcement for up to ~5s, but give up right away
  # if the server process has already exited.
  local attempt
  for (( attempt = 0; attempt < 50; attempt++ )); do
    [[ -s "$port_file" ]] && break
    kill -0 "$RELEASE_HTTP_PID" 2>/dev/null || break
    sleep 0.1
  done
  RELEASE_PORT="$(head -n1 "$port_file")"
  # Require digits only so a garbled announcement can't produce a bogus URL.
  [[ "$RELEASE_PORT" =~ ^[0-9]+$ ]] || die 'release HTTP server did not announce a port'

  MANIFEST_URL="http://127.0.0.1:$RELEASE_PORT/manifest.json"
  write_smoke_manifest "$rel_dir/manifest.json" "http://127.0.0.1:$RELEASE_PORT"
  SMOKE_RELEASES_READY=1
  log "release server ready at $MANIFEST_URL"
}
# Builds banger / bangerd / banger-vsock-agent under -ldflags pointing
# Version at $version, tarballs them, writes a sha256sums file, and
# signs it with the smoke release key.
#
# Arguments:
#   $1 - release directory (must already contain cosign.key)
#   $2 - version string; v0.smoke.broken-* gets a stub bangerd
# Output:
#   $rel_dir/$version/banger-$version-linux-amd64.tar.gz
#   $rel_dir/$version/SHA256SUMS
#   $rel_dir/$version/SHA256SUMS.sig
# Exits via die on any failed step.
build_smoke_release_tarball() {
  local rel_dir="$1"
  local version="$2"
  local out_dir="$rel_dir/$version"
  local stage="$out_dir/.stage"
  mkdir -p -- "$stage" || die "create stage dir for $version failed"

  # Resolve the repo once; every go build below runs from there.
  local root
  root="$(repo_root)" || die 'could not resolve repo root'

  local ldflags="-X banger/internal/buildinfo.Version=$version -X banger/internal/buildinfo.Commit=smoke -X banger/internal/buildinfo.BuiltAt=2026-04-30T00:00:00Z"
  ( cd "$root" && go build -ldflags "$ldflags" -o "$stage/banger" ./cmd/banger ) \
    || die "build banger@$version failed"
  if [[ "$version" == v0.smoke.broken-* ]]; then
    # v0.smoke.broken-* is the rollback drill's intentionally-broken
    # release: bangerd passes the pre-swap --check-migrations sanity
    # (so the swap proceeds) but exits non-zero in service mode (so
    # the post-swap `systemctl restart bangerd` fails and runUpdate
    # takes its rollbackAndRestart path). Shell script is enough —
    # systemd's ExecStart= handles the shebang.
    cat >"$stage/bangerd" <<'BROKEN' || die "write broken bangerd stub for $version failed"
#!/bin/sh
case "$*" in
*--check-migrations*)
printf 'compatible: smoke broken-bangerd pretends to be ready\n'
exit 0
;;
*)
printf 'smoke broken-bangerd: refusing to run as daemon\n' >&2
exit 1
;;
esac
BROKEN
    chmod 0755 "$stage/bangerd" || die "chmod broken bangerd stub for $version failed"
  else
    ( cd "$root" && go build -ldflags "$ldflags" -o "$stage/bangerd" ./cmd/bangerd ) \
      || die "build bangerd@$version failed"
  fi
  ( cd "$root" && CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags "$ldflags" -o "$stage/banger-vsock-agent" ./cmd/banger-vsock-agent ) \
    || die "build banger-vsock-agent@$version failed"

  local tarball_name="banger-$version-linux-amd64.tar.gz"
  ( cd "$stage" && tar czf "$out_dir/$tarball_name" banger bangerd banger-vsock-agent ) \
    || die "tar $version failed"

  # An empty digest means sha256sum failed; refuse to publish a
  # SHA256SUMS line without a hash.
  local digest
  digest="$(sha256sum "$out_dir/$tarball_name" | awk '{print $1}')"
  [[ -n "$digest" ]] || die "sha256sum for $version failed"
  printf '%s %s\n' "$digest" "$tarball_name" >"$out_dir/SHA256SUMS"

  # cosign blob signature == base64(ECDSA-ASN.1 over SHA256(body)).
  # `openssl dgst -sha256 -sign` produces the exact same encoding.
  # Sign and encode as two checked steps: in a pipeline, an openssl
  # failure is hidden behind base64's exit status unless pipefail is on.
  local sig_der="$stage/SHA256SUMS.sig.der"
  openssl dgst -sha256 -sign "$rel_dir/cosign.key" -out "$sig_der" "$out_dir/SHA256SUMS" \
    || die "sign SHA256SUMS for $version failed"
  base64 -w0 "$sig_der" >"$out_dir/SHA256SUMS.sig" \
    || die "sign SHA256SUMS for $version failed"

  rm -rf -- "$stage"
}
repo_root() {
# smoke.sh lives at $repo/scripts/smoke.sh; resolve the repo dir
# without depending on PWD or BASH_SOURCE-relative cwd at call time.
local script_dir
script_dir="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
( cd "$script_dir/.." && pwd )
}
# Writes the fake release manifest to $1, with every artefact URL rooted
# at base URL $2. The manifest lists v0.smoke.0 (also latest_stable) and
# v0.smoke.broken-bangerd, in that order.
write_smoke_manifest() {
  local manifest_path="$1"
  local base_url="$2"
  local -a versions=(v0.smoke.0 v0.smoke.broken-bangerd)
  local -a release_dates=(2026-04-29T00:00:00Z 2026-04-30T00:00:00Z)
  local idx ver

  {
    printf '{\n"schema_version": 1,\n"latest_stable": "v0.smoke.0",\n"releases": [\n'
    for idx in "${!versions[@]}"; do
      ver="${versions[idx]}"
      # Entries are comma-separated; no trailing comma after the last one.
      (( idx == 0 )) || printf ',\n'
      printf '{\n"version": "%s",\n' "$ver"
      printf '"tarball_url": "%s/%s/banger-%s-linux-amd64.tar.gz",\n' "$base_url" "$ver" "$ver"
      printf '"sha256sums_url": "%s/%s/SHA256SUMS",\n' "$base_url" "$ver"
      printf '"sha256sums_sig_url": "%s/%s/SHA256SUMS.sig",\n' "$base_url" "$ver"
      printf '"released_at": "%s"\n}' "${release_dates[idx]}"
    done
    printf '\n]\n}\n'
  } >"$manifest_path"
}
# installed_version prints the version reported by the *installed* CLI,
# i.e. /usr/local/bin/banger, NOT the smoke build dir. This is the value
# that changes after `banger update` swaps the on-disk binary.
#
# Output: the second whitespace-separated field of each line that
# `banger --version` writes to stdout. The CLI's stderr is discarded, so
# when it writes nothing to stdout this prints nothing.
installed_version() {
# awk picks field 2 of the `--version` line; callers compare the result
# as an opaque string.
/usr/local/bin/banger --version 2>/dev/null | awk '{print $2}'
}
# Scenario: `banger update --check` against the fake manifest must exit 0
# and report that an update is available.
scenario_update_check() {
  log "${SMOKE_DESCS[update_check]}"
  prepare_smoke_releases

  local -a check_cmd=(
    "$BANGER" update --check
    --manifest-url "$MANIFEST_URL" --pubkey-file "$PUBKEY_FILE"
  )
  local report
  if ! report="$("${check_cmd[@]}" 2>&1)"; then
    die "update --check failed: $report"
  fi
  if [[ "$report" != *'update available: '* ]]; then
    die "update --check stdout missing 'update available:' line; got: $report"
  fi
}
# Scenario: `banger update --to v9.9.9` (a version absent from the fake
# manifest) must exit non-zero with a "not found" error and leave the
# installed version unchanged.
scenario_update_to_unknown() {
  log "${SMOKE_DESCS[update_to_unknown]}"
  prepare_smoke_releases

  local pre_ver post_ver out rc=0
  pre_ver="$(installed_version)"

  # `|| rc=$?` records the expected failure without changing the shell's
  # errexit setting, so the caller's option state is left untouched.
  out="$("$BANGER" update --to v9.9.9 \
    --manifest-url "$MANIFEST_URL" --pubkey-file "$PUBKEY_FILE" 2>&1)" || rc=$?

  (( rc != 0 )) || die "update --to v9.9.9: exit 0 (out: $out)"
  grep -qi 'not found' <<<"$out" \
    || die "update --to v9.9.9: error doesn't say 'not found'; got: $out"

  post_ver="$(installed_version)"
  [[ "$pre_ver" == "$post_ver" ]] \
    || die "update --to v9.9.9 mutated the install: $pre_ver -> $post_ver"
}
# Scenario: `banger update` run without sudo must exit non-zero with an
# error that mentions root, and leave the installed version unchanged.
scenario_update_no_root() {
  log "${SMOKE_DESCS[update_no_root]}"
  prepare_smoke_releases

  local pre_ver post_ver out rc=0
  pre_ver="$(installed_version)"

  # `|| rc=$?` records the expected failure without changing the shell's
  # errexit setting, so the caller's option state is left untouched.
  out="$("$BANGER" update --to v0.smoke.0 \
    --manifest-url "$MANIFEST_URL" --pubkey-file "$PUBKEY_FILE" 2>&1)" || rc=$?

  (( rc != 0 )) || die "update without sudo: exit 0 (out: $out)"
  grep -qi 'root' <<<"$out" \
    || die "update without sudo: error doesn't mention root; got: $out"

  post_ver="$(installed_version)"
  [[ "$pre_ver" == "$post_ver" ]] \
    || die "update without sudo mutated the install: $pre_ver -> $post_ver"
}
# Scenario: `banger update --dry-run` under sudo must succeed, print a
# 'dry-run:' marker, and leave the installed version unchanged. Skipped
# (returns 0) when passwordless sudo is unavailable.
scenario_update_dry_run() {
  log "${SMOKE_DESCS[update_dry_run]}"
  prepare_smoke_releases

  sudo -n true 2>/dev/null || {
    log 'update_dry_run: passwordless sudo unavailable; skipping'
    return 0
  }

  local before after report
  before="$(installed_version)"
  if ! report="$(sudo_banger "$BANGER" update --to v0.smoke.0 --dry-run \
    --manifest-url "$MANIFEST_URL" --pubkey-file "$PUBKEY_FILE" 2>&1)"; then
    die "update --dry-run failed: $report"
  fi
  if [[ "$report" != *'dry-run:'* ]]; then
    die "update --dry-run stdout missing 'dry-run:' marker; got: $report"
  fi

  after="$(installed_version)"
  if [[ "$before" != "$after" ]]; then
    die "update --dry-run swapped the binary: $before -> $after"
  fi
}
# vm_boot_id prints /proc/sys/kernel/random/boot_id as read over
# `banger vm ssh` from the guest named $1. The kernel regenerates that
# value on every boot, so an unchanged boot_id before and after a daemon
# restart shows the guest did NOT reboot. stderr is discarded; the exit
# status is that of the ssh invocation.
vm_boot_id() {
  local vm_name="$1"
  "$BANGER" vm ssh "$vm_name" -- cat /proc/sys/kernel/random/boot_id 2>/dev/null
}
# Dumps state for a smoke-update VM that stopped answering SSH after the
# update: VM record, firecracker processes, the helper unit's kill
# directives, and recent journal lines. Everything goes to stderr and
# every probe is best-effort.
dump_smoke_update_diagnostics() {
  log 'smoke-update unreachable AFTER update; dumping diagnostics:'
  "$BANGER" vm show smoke-update 2>&1 | sed 's/^/ show: /' >&2 || true
  pgrep -af firecracker | sed 's/^/ fc-procs: /' >&2 || true
  sudo grep -E 'KillMode|SendSIGKILL' /etc/systemd/system/bangerd-root.service 2>&1 | sed 's/^/ unit: /' >&2 || true
  systemctl show bangerd-root.service --property=KillMode,SendSIGKILL,FinalKillSignal 2>&1 | sed 's/^/ unit-prop: /' >&2 || true
  sudo journalctl -u bangerd.service -u bangerd-root.service --since '120 seconds ago' --no-pager 2>&1 | tail -40 | sed 's/^/ journal: /' >&2 || true
}

# Scenario: a real update to v0.smoke.0 must flip the installed binary
# version and install.toml, while the smoke-update VM keeps answering SSH
# with the same boot_id (i.e. it was not rebooted). Skipped (returns 0)
# when passwordless sudo is unavailable.
scenario_update_keeps_vm_alive() {
  log "${SMOKE_DESCS[update_keeps_vm_alive]}"
  prepare_smoke_releases

  sudo -n true 2>/dev/null || {
    log 'update_keeps_vm_alive: passwordless sudo unavailable; skipping'
    return 0
  }

  if ! "$BANGER" vm create --name smoke-update >/dev/null; then
    die 'create smoke-update failed'
  fi
  wait_for_ssh smoke-update || die 'smoke-update unreachable pre-update'

  # Baseline taken before the swap.
  local boot_before boot_after ver_before ver_after
  boot_before="$(vm_boot_id smoke-update)"
  if [[ -z "$boot_before" ]]; then
    die 'pre-update boot_id capture failed'
  fi
  ver_before="$(installed_version)"

  if ! sudo_banger "$BANGER" update --to v0.smoke.0 \
    --manifest-url "$MANIFEST_URL" --pubkey-file "$PUBKEY_FILE" >/dev/null; then
    die 'update --to v0.smoke.0 failed'
  fi

  # The on-disk CLI must now report the new version.
  ver_after="$(installed_version)"
  if [[ "$ver_after" != "v0.smoke.0" ]]; then
    die "post-update /usr/local/bin/banger version = $ver_after, want v0.smoke.0"
  fi
  if [[ "$ver_before" == "$ver_after" ]]; then
    die "update did not change the binary version (pre==post=$ver_after)"
  fi

  # ...and so must the install metadata.
  local meta_ver
  meta_ver="$(sudo grep -E '^version[[:space:]]*=' /etc/banger/install.toml | sed -E 's/.*"([^"]+)".*/\1/')"
  if [[ "$meta_ver" != "v0.smoke.0" ]]; then
    die "install.toml version = '$meta_ver', want v0.smoke.0"
  fi

  if ! wait_for_ssh smoke-update; then
    dump_smoke_update_diagnostics
    die 'smoke-update unreachable AFTER update — daemon restart likely killed VM'
  fi

  boot_after="$(vm_boot_id smoke-update)"
  if [[ -z "$boot_after" ]]; then
    die 'post-update boot_id read failed'
  fi
  if [[ "$boot_before" != "$boot_after" ]]; then
    die "VM rebooted during update: boot_id $boot_before -> $boot_after"
  fi

  "$BANGER" vm delete smoke-update >/dev/null 2>&1 || true
}
# Scenario (rollback drill): updating to v0.smoke.broken-bangerd must
# fail, the installed version must be back at its pre-update value, and
# the smoke-rollback VM must still answer SSH with the same boot_id.
# Skipped (returns 0) when passwordless sudo is unavailable.
scenario_update_rollback_keeps_vm_alive() {
  log "${SMOKE_DESCS[update_rollback_keeps_vm_alive]}"
  prepare_smoke_releases

  if ! sudo -n true 2>/dev/null; then
    log 'update_rollback_keeps_vm_alive: passwordless sudo unavailable; skipping'
    return 0
  fi

  # The v0.smoke.broken-bangerd release ships a bangerd that passes
  # the pre-swap --check-migrations sanity (so the swap proceeds) but
  # exits non-zero when systemd starts it as the daemon. That trips
  # runUpdate's `restart bangerd` step: rollbackAndRestart runs, the
  # previous binaries are restored from .previous, and the helper +
  # daemon are re-restarted onto the prior install.
  local pre_ver
  pre_ver="$(installed_version)"

  "$BANGER" vm create --name smoke-rollback >/dev/null \
    || die 'create smoke-rollback failed'
  wait_for_ssh smoke-rollback || die 'smoke-rollback unreachable pre-drill'

  local pre_boot post_boot
  pre_boot="$(vm_boot_id smoke-rollback)"
  [[ -n "$pre_boot" ]] || die 'pre-drill boot_id capture failed'

  # The update is EXPECTED to fail. `|| rc=$?` records the status
  # without changing the shell's errexit setting, so the caller's option
  # state is left untouched.
  local rc=0 upd_log
  upd_log="$scratch_root/rollback-update.log"
  sudo_banger "$BANGER" update --to v0.smoke.broken-bangerd \
    --manifest-url "$MANIFEST_URL" --pubkey-file "$PUBKEY_FILE" >"$upd_log" 2>&1 || rc=$?
  if (( rc == 0 )); then
    log 'rollback drill: update returned exit 0 despite broken bangerd'
    sed 's/^/ upd: /' "$upd_log" >&2 || true
    die 'rollback drill: expected non-zero exit'
  fi

  # Rollback should have restored the binaries to whatever was running
  # pre-update.
  local post_ver
  post_ver="$(installed_version)"
  [[ "$post_ver" == "$pre_ver" ]] \
    || die "rollback drill: post-rollback version = $post_ver, want $pre_ver"

  wait_for_ssh smoke-rollback \
    || die 'smoke-rollback unreachable AFTER rollback — VM did not survive'
  post_boot="$(vm_boot_id smoke-rollback)"
  [[ -n "$post_boot" ]] || die 'post-rollback boot_id read failed'
  [[ "$pre_boot" == "$post_boot" ]] \
    || die "VM rebooted during rollback drill: boot_id $pre_boot -> $post_boot"

  "$BANGER" vm delete smoke-rollback >/dev/null 2>&1 || true
}
# daemon_admin has to stay the LAST entry in the scenario registry:
# `banger daemon stop` tears the installed services down, so any later
# scenario that talks to the daemon would fail. Cleanup re-stops
# idempotently and the uninstall path doesn't need active services.
#
# Checks, in order: `banger daemon socket` prints the system socket
# path; `bangerd --system --check-migrations` reports 'compatible:';
# and (only with passwordless sudo) `banger daemon stop` leaves both
# services reported inactive by `banger system status`.
scenario_daemon_admin() {
  log "${SMOKE_DESCS[daemon_admin]}"

  local sock_path migrations_report service_status
  local want_sock="/run/banger/bangerd.sock"

  if ! sock_path="$("$BANGER" daemon socket)"; then
    die 'daemon socket: command failed'
  fi
  if [[ "$sock_path" != "$want_sock" ]]; then
    die "daemon socket: got '$sock_path', want '/run/banger/bangerd.sock'"
  fi

  if ! migrations_report="$("$BANGERD" --system --check-migrations)"; then
    die "bangerd --check-migrations: non-zero exit (out: $migrations_report)"
  fi
  if ! grep -q '^compatible:' <<<"$migrations_report"; then
    die "bangerd --check-migrations: stdout missing 'compatible:' prefix; got: $migrations_report"
  fi

  sudo -n true 2>/dev/null || {
    log 'daemon_admin: passwordless sudo unavailable; skipping daemon stop assertion'
    return 0
  }

  if ! sudo_banger "$BANGER" daemon stop >/dev/null; then
    die 'banger daemon stop: command failed'
  fi

  # A failing status command is tolerated here; the greps below judge
  # whatever output it produced.
  service_status="$("$BANGER" system status 2>/dev/null || true)"
  if ! grep -qE '^active +inactive' <<<"$service_status"; then
    die "owner daemon still active after daemon stop: $service_status"
  fi
  if ! grep -qE '^helper_active +inactive' <<<"$service_status"; then
    die "root helper still active after daemon stop: $service_status"
  fi
}
# ---------------------------------------------------------------------
# Dispatchers.
# ---------------------------------------------------------------------