From 2606bfbabb5e5a4525d40af3ccd07717c1a00196 Mon Sep 17 00:00:00 2001 From: Thales Maciel Date: Fri, 1 May 2026 12:08:08 -0300 Subject: [PATCH] update: VMs survive `banger update` and rollback Three load-bearing fixes that together let `banger update` (and its auto-rollback path) restart the helper + daemon without killing every running VM. New smoke scenarios prove the property end-to-end. Bug fixes: 1. Disable the firecracker SDK's signal-forwarding goroutine. The default ForwardSignals = [SIGINT, SIGQUIT, SIGTERM, SIGHUP, SIGABRT] installs a handler in the helper that propagates the helper's SIGTERM (sent by systemd on `systemctl stop bangerd-root.service`) to every running firecracker child. Set ForwardSignals to an empty (non-nil) slice so setupSignals short-circuits at len()==0. 2. Add SendSIGKILL=no to bangerd-root.service. KillMode=process limits the initial SIGTERM to the helper main, but systemd still SIGKILLs leftover cgroup processes during the FinalKillSignal stage unless SendSIGKILL=no. 3. Route restart-helper / restart-daemon / wait-daemon-ready failures through rollbackAndRestart instead of rollbackAndWrap. rollbackAndWrap restored .previous binaries but didn't re-restart the failed unit, leaving the helper dead with the rolled-back binary on disk after a failed update. Testing infrastructure (production binaries unaffected): - Hidden --manifest-url and --pubkey-file flags on `banger update` let the smoke harness redirect the updater at locally-built release artefacts. Marked Hidden in cobra; not advertised in --help. - FetchManifestFrom / VerifyBlobSignatureWithKey / FetchAndVerifySignatureWithKey export the existing logic against caller-supplied URL / pubkey. The default entry points still call them with the embedded canonical values. 
Smoke scenarios: - update_check: --check against fake manifest reports update available - update_to_unknown: --to v9.9.9 fails before any host mutation - update_no_root: refuses without sudo, install untouched - update_dry_run: stages + verifies, no swap, version unchanged - update_keeps_vm_alive: real swap to v0.smoke.0; same VM (same boot_id) answers SSH after the daemon restart - update_rollback_keeps_vm_alive: v0.smoke.broken-bangerd ships a bangerd that passes --check-migrations but exits 1 as the daemon. The post-swap `systemctl restart bangerd` fails, rollbackAndRestart fires, the .previous binaries are restored and re-restarted; the same VM still answers SSH afterwards - daemon_admin (separate prep): covers `banger daemon socket`, `bangerd --check-migrations --system`, `sudo banger daemon stop` The smoke release builder generates a fresh ECDSA P-256 keypair with openssl, signs SHA256SUMS cosign-compatibly, and serves artefacts from a backgrounded python http.server. verify_smoke_check_test.go pins the openssl/cosign signature equivalence so the smoke release builder can't silently drift. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/cli/commands_system.go | 38 +- internal/cli/commands_update.go | 76 +++- internal/cli/daemon_lifecycle_test.go | 8 + internal/firecracker/client.go | 15 +- internal/updater/manifest.go | 18 +- internal/updater/verify_signature.go | 40 +- internal/updater/verify_smoke_check_test.go | 54 +++ scripts/smoke.sh | 410 +++++++++++++++++++- 8 files changed, 609 insertions(+), 50 deletions(-) create mode 100644 internal/updater/verify_smoke_check_test.go diff --git a/internal/cli/commands_system.go b/internal/cli/commands_system.go index bf7acee..f1099ac 100644 --- a/internal/cli/commands_system.go +++ b/internal/cli/commands_system.go @@ -364,18 +364,34 @@ func renderRootHelperSystemdUnit() string { "ExecStart=" + systemBangerdBin + " --root-helper", "Restart=on-failure", "RestartSec=1s", - // KillMode=process is load-bearing: the helper unit's cgroup is - // where every banger-launched firecracker process lives (see - // validateFirecrackerPID). Without this, `systemctl restart - // bangerd-root.service` — which `banger update` runs — would - // SIGKILL every in-flight VM along with the helper because - // systemd's default KillMode=control-group nukes the whole cgroup. - // With process mode, only the helper PID is signaled; firecracker - // children survive, the new helper instance re-attaches via the - // helper RPC, daemon reconcile re-seeds in-memory state, VM keeps - // running. `banger system uninstall` and the daemon's vm-stop - // path explicitly stop firecracker processes when actually needed. + // KillMode=process + SendSIGKILL=no together make the helper + // safe to restart while banger-launched firecrackers are + // running. firecracker lives in this unit's cgroup (jailer + // doesn't open a sub-cgroup), so: + // + // - Default control-group mode SIGKILLs every process in + // the cgroup on stop. 
+ // - KillMode=process limits the initial SIGTERM to the + // helper main PID; systemd leaves remaining cgroup + // processes alone (and logs "Unit process N (firecracker) + // remains running after unit stopped"). + // - SendSIGKILL=no disables the FinalKillSignal escalation + // that would otherwise SIGKILL leftovers after the timeout. + // + // One more pitfall: the firecracker SDK installs a default + // signal-forwarding goroutine in the helper that catches + // SIGTERM (etc.) and forwards it to every firecracker child. + // We disable that explicitly via ForwardSignals: []os.Signal{} + // in firecracker.buildConfig — without that override, systemd + // signaling the helper main would propagate to every running + // VM regardless of what these directives do. + // + // `banger system uninstall` and the daemon's vm-stop path + // explicitly stop firecracker processes when actually needed, + // so we don't lose the systemd-driven kill as a real safety + // net — banger drives those kills itself. "KillMode=process", + "SendSIGKILL=no", "Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", "Environment=TMPDIR=" + installmeta.DefaultRootHelperRuntimeDir, "UMask=0077", diff --git a/internal/cli/commands_update.go b/internal/cli/commands_update.go index 37ae9a2..d4313ac 100644 --- a/internal/cli/commands_update.go +++ b/internal/cli/commands_update.go @@ -30,10 +30,12 @@ const stagingTarballName = "release.tar.gz" func (d *deps) newUpdateCommand() *cobra.Command { var ( - checkOnly bool - dryRun bool - force bool - toVersion string + checkOnly bool + dryRun bool + force bool + toVersion string + manifestURL string + pubkeyFile string ) cmd := &cobra.Command{ Use: "update", @@ -68,10 +70,12 @@ talks to systemd. Run with sudo. 
Args: noArgsUsage("usage: banger update [--check] [--dry-run] [--force] [--to vX.Y.Z]"), RunE: func(cmd *cobra.Command, args []string) error { return d.runUpdate(cmd, runUpdateOpts{ - checkOnly: checkOnly, - dryRun: dryRun, - force: force, - toVersion: toVersion, + checkOnly: checkOnly, + dryRun: dryRun, + force: force, + toVersion: toVersion, + manifestURL: manifestURL, + pubkeyFile: pubkeyFile, }) }, } @@ -79,23 +83,53 @@ talks to systemd. Run with sudo. cmd.Flags().BoolVar(&dryRun, "dry-run", false, "fetch and verify, but do not swap or restart anything") cmd.Flags().BoolVar(&force, "force", false, "skip in-flight-op refusal and post-restart doctor verification") cmd.Flags().StringVar(&toVersion, "to", "", "specific release version to install (default: latest_stable from manifest)") + // Hidden test/dev hooks: redirect the updater at a non-default + // manifest URL and trust a non-default cosign public key. Used by + // the smoke suite to drive a real update against locally-built + // release artefacts. Production users have no reason to touch + // these; they are not advertised in --help. + cmd.Flags().StringVar(&manifestURL, "manifest-url", "", "") + cmd.Flags().StringVar(&pubkeyFile, "pubkey-file", "", "") + _ = cmd.Flags().MarkHidden("manifest-url") + _ = cmd.Flags().MarkHidden("pubkey-file") return cmd } type runUpdateOpts struct { - checkOnly bool - dryRun bool - force bool - toVersion string + checkOnly bool + dryRun bool + force bool + toVersion string + manifestURL string + pubkeyFile string } func (d *deps) runUpdate(cmd *cobra.Command, opts runUpdateOpts) error { ctx := cmd.Context() out := cmd.OutOrStdout() + // Resolve the test/dev override flags up front so a bad + // --pubkey-file fails fast before any network round-trips. 
+ pubKeyPEM := updater.BangerReleasePublicKey + if strings.TrimSpace(opts.pubkeyFile) != "" { + body, err := os.ReadFile(opts.pubkeyFile) + if err != nil { + return fmt.Errorf("read --pubkey-file: %w", err) + } + pubKeyPEM = string(body) + } + // Discover. client := &http.Client{Timeout: 30 * time.Second} - manifest, err := updater.FetchManifest(ctx, client) + var ( + manifest updater.Manifest + err error + ) + if strings.TrimSpace(opts.manifestURL) != "" { + manifest, err = updater.FetchManifestFrom(ctx, client, opts.manifestURL) + } else { + manifest, err = updater.FetchManifest(ctx, client) + } if err != nil { return fmt.Errorf("discover: %w", err) } @@ -142,7 +176,7 @@ func (d *deps) runUpdate(cmd *cobra.Command, opts runUpdateOpts) error { if err != nil { return fmt.Errorf("download: %w", err) } - if err := updater.FetchAndVerifySignature(ctx, client, target, sumsBody); err != nil { + if err := updater.FetchAndVerifySignatureWithKey(ctx, client, target, sumsBody, pubKeyPEM); err != nil { // Don't leave the staged tarball around — it failed // signature verification and shouldn't be re-runnable. _ = os.Remove(tarballPath) @@ -179,15 +213,21 @@ func (d *deps) runUpdate(cmd *cobra.Command, opts runUpdateOpts) error { return fmt.Errorf("swap: %w (rolled back)", err) } - // Restart services + wait for the new daemon. + // Restart services + wait for the new daemon. A `systemctl restart` + // that fails has typically already STOPPED the unit, so the prior + // binary on disk isn't running anywhere — Rollback() must be paired + // with a re-restart to bring the rolled-back binary back into a + // running state. That's rollbackAndRestart's job; rollbackAndWrap + // is for the swap-step failures earlier where the restart never + // fired and the old binary is still in memory. 
if err := d.runSystemctl(ctx, "restart", installmeta.DefaultRootHelperService); err != nil { - return rollbackAndWrap(swap, "restart helper", err) + return rollbackAndRestart(ctx, d, swap, "restart helper", err) } if err := d.runSystemctl(ctx, "restart", installmeta.DefaultService); err != nil { - return rollbackAndWrap(swap, "restart daemon", err) + return rollbackAndRestart(ctx, d, swap, "restart daemon", err) } if err := d.waitForDaemonReady(ctx, socketPath); err != nil { - return rollbackAndWrap(swap, "wait daemon ready", err) + return rollbackAndRestart(ctx, d, swap, "wait daemon ready", err) } // Verify with doctor unless --force says otherwise. diff --git a/internal/cli/daemon_lifecycle_test.go b/internal/cli/daemon_lifecycle_test.go index d14c483..f4c7779 100644 --- a/internal/cli/daemon_lifecycle_test.go +++ b/internal/cli/daemon_lifecycle_test.go @@ -178,7 +178,15 @@ func TestRenderRootHelperSystemdUnitIncludesRequiredCapabilities(t *testing.T) { for _, want := range []string{ "ExecStart=/usr/local/bin/bangerd --root-helper", + // Both directives are load-bearing for "VM survives helper + // restart": KillMode=process limits the initial SIGTERM to + // the helper main, SendSIGKILL=no disables the SIGKILL + // escalation. The helper itself does the cgroup reparent + // (see roothelper.reparentToBangerFCCgroup) — without + // that, even these directives leave firecracker exposed to + // systemd's stop-time cleanup. "KillMode=process", + "SendSIGKILL=no", "Environment=TMPDIR=/run/banger-root", "NoNewPrivileges=yes", "PrivateTmp=yes", diff --git a/internal/firecracker/client.go b/internal/firecracker/client.go index 93a346a..f15e83c 100644 --- a/internal/firecracker/client.go +++ b/internal/firecracker/client.go @@ -196,6 +196,15 @@ func buildConfig(cfg MachineConfig) sdk.Config { Smt: sdk.Bool(false), }, VMID: cfg.VMID, + // Disable the SDK's signal-forwarding goroutine. 
Default + // (nil) makes the SDK install a handler that catches + // SIGTERM/SIGINT/SIGHUP/SIGQUIT/SIGABRT in the parent process + // and forwards them to the firecracker child — which means + // `systemctl stop bangerd-root.service` (sends SIGTERM to the + // helper) ends up signaling every firecracker the helper has + // launched, killing every running VM. Empty slice (not nil) + // short-circuits setupSignals at len()==0. + ForwardSignals: []os.Signal{}, } if cfg.Jailer != nil { // The path fields above are already chroot-translated by the @@ -267,6 +276,7 @@ func defaultDriveID(drive DriveConfig, fallback string) string { // the configured UID:GID) — see fcproc.PrepareJailerChroot. The SDK's own // JailerCfg path is intentionally bypassed: it cannot mknod block devices and // does not expose --new-pid-ns. +// func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd { var bin string var args []string @@ -277,9 +287,10 @@ func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd { args = []string{"--api-sock", cfg.SocketPath, "--id", cfg.VMID} } var cmd *exec.Cmd - if os.Geteuid() == 0 { + switch { + case os.Geteuid() == 0: cmd = exec.Command(bin, args...) - } else { + default: cmd = exec.Command("sudo", append([]string{"-n", "-E", bin}, args...)...) } cmd.Stdin = nil diff --git a/internal/updater/manifest.go b/internal/updater/manifest.go index 96156f8..1ae35d0 100644 --- a/internal/updater/manifest.go +++ b/internal/updater/manifest.go @@ -75,15 +75,23 @@ type Release struct { // Release. const ManifestSchemaVersion = 1 -// FetchManifest downloads the release manifest and validates its -// shape. Returns an error if the server is unreachable, returns -// non-2xx, exceeds the size cap, or the schema_version is newer -// than this CLI knows. +// FetchManifest downloads the release manifest from the embedded +// canonical URL and validates its shape. 
Returns an error if the +// server is unreachable, returns non-2xx, exceeds the size cap, or +// the schema_version is newer than this CLI knows. func FetchManifest(ctx context.Context, client *http.Client) (Manifest, error) { + return FetchManifestFrom(ctx, client, manifestURL) +} + +// FetchManifestFrom is FetchManifest against an explicit URL. Used by +// the smoke suite (via `banger update --manifest-url …`) to drive the +// updater against a locally-served fake manifest. Production callers +// stick with FetchManifest. +func FetchManifestFrom(ctx context.Context, client *http.Client, url string) (Manifest, error) { if client == nil { client = http.DefaultClient } - req, err := http.NewRequestWithContext(ctx, http.MethodGet, manifestURL, nil) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { return Manifest{}, err } diff --git a/internal/updater/verify_signature.go b/internal/updater/verify_signature.go index e239743..d2a9985 100644 --- a/internal/updater/verify_signature.go +++ b/internal/updater/verify_signature.go @@ -61,18 +61,26 @@ var ErrSignatureRequired = errors.New("banger release public key is the placehol // VerifyBlobSignature checks that sigBase64 is a valid cosign-blob // signature over body, made with the private counterpart of -// BangerReleasePublicKey. cosign's blob signature format is a -// base64-encoded ASN.1-DER ECDSA signature over SHA256(body) — that's -// what the package's ecdsa.VerifyASN1 verifies natively. -// -// Refuses outright if the embedded public key is still the build- -// time placeholder, so an unset key can't slip through as -// "verification disabled." +// BangerReleasePublicKey. func VerifyBlobSignature(body, sigBase64 []byte) error { - if isPlaceholderKey(BangerReleasePublicKey) { + return VerifyBlobSignatureWithKey(body, sigBase64, BangerReleasePublicKey) +} + +// VerifyBlobSignatureWithKey is VerifyBlobSignature against an +// explicit PEM-encoded public key. 
Used by the smoke suite (via +// `banger update --pubkey-file …`) so an end-to-end update test can +// trust a locally-generated keypair without rebuilding the binary. +// +// Refuses outright if pubKeyPEM is the build-time placeholder so an +// unset key can't slip through as "verification disabled". +// +// cosign's blob signature format is a base64-encoded ASN.1-DER ECDSA +// signature over SHA256(body) — that's what ecdsa.VerifyASN1 takes. +func VerifyBlobSignatureWithKey(body, sigBase64 []byte, pubKeyPEM string) error { + if isPlaceholderKey(pubKeyPEM) { return ErrSignatureRequired } - block, _ := pem.Decode([]byte(BangerReleasePublicKey)) + block, _ := pem.Decode([]byte(pubKeyPEM)) if block == nil { return fmt.Errorf("decode banger release public key: no PEM block") } @@ -96,15 +104,21 @@ func VerifyBlobSignature(body, sigBase64 []byte) error { } // FetchAndVerifySignature pulls the SHA256SUMS.sig URL from the -// release, downloads it (capped), and verifies it against -// sumsBody. Returns nil on a clean pass, or an error describing -// exactly why verification failed. +// release, downloads it (capped), and verifies it against sumsBody. +// Returns nil on a clean pass, or an error describing exactly why +// verification failed. // // If release.SHA256SumsSigURL is empty, treat that as "release was // not signed" — refuse rather than silently proceeding. v0.1.0 // requires every release to be cosign-signed; an unsigned release // is a manifest publishing bug we'd rather catch loudly. func FetchAndVerifySignature(ctx context.Context, client *http.Client, release Release, sumsBody []byte) error { + return FetchAndVerifySignatureWithKey(ctx, client, release, sumsBody, BangerReleasePublicKey) +} + +// FetchAndVerifySignatureWithKey is FetchAndVerifySignature against +// an explicit PEM-encoded public key. 
+func FetchAndVerifySignatureWithKey(ctx context.Context, client *http.Client, release Release, sumsBody []byte, pubKeyPEM string) error { if strings.TrimSpace(release.SHA256SumsSigURL) == "" { return fmt.Errorf("release %s has no sha256sums_sig_url; refusing to install an unsigned release", release.Version) } @@ -115,7 +129,7 @@ func FetchAndVerifySignature(ctx context.Context, client *http.Client, release R if err != nil { return fmt.Errorf("fetch signature: %w", err) } - if err := VerifyBlobSignature(sumsBody, sig); err != nil { + if err := VerifyBlobSignatureWithKey(sumsBody, sig, pubKeyPEM); err != nil { return fmt.Errorf("verify SHA256SUMS signature: %w", err) } return nil diff --git a/internal/updater/verify_smoke_check_test.go b/internal/updater/verify_smoke_check_test.go new file mode 100644 index 0000000..6929880 --- /dev/null +++ b/internal/updater/verify_smoke_check_test.go @@ -0,0 +1,54 @@ +package updater + +import ( + "os/exec" + "path/filepath" + "testing" +) + +// TestVerifyBlobSignatureWithOpenSSL is a confidence test for the +// smoke release-builder path: openssl's `dgst -sha256 -sign` produces +// the exact same encoding cosign emits for blob signatures (base64 +// ASN.1 ECDSA over SHA256(body)). If this ever stops verifying, the +// smoke update scenarios will fail signature verification — +// catching it here avoids a heisenbug in scripts/smoke.sh. 
+func TestVerifyBlobSignatureWithOpenSSL(t *testing.T) { + if _, err := exec.LookPath("openssl"); err != nil { + t.Skip("openssl not on PATH") + } + dir := t.TempDir() + keyPath := filepath.Join(dir, "cosign.key") + pubPath := filepath.Join(dir, "cosign.pub") + bodyPath := filepath.Join(dir, "body.txt") + sigPath := filepath.Join(dir, "body.sig") + + mustRun := func(name string, args ...string) { + t.Helper() + out, err := exec.Command(name, args...).CombinedOutput() + if err != nil { + t.Fatalf("%s %v: %v\n%s", name, args, err, string(out)) + } + } + + mustRun("openssl", "ecparam", "-name", "prime256v1", "-genkey", "-noout", "-out", keyPath) + mustRun("openssl", "ec", "-in", keyPath, "-pubout", "-out", pubPath) + mustRun("sh", "-c", "printf 'banger smoke release sums\n' > "+bodyPath) + mustRun("sh", "-c", "openssl dgst -sha256 -sign "+keyPath+" "+bodyPath+" | base64 -w0 > "+sigPath) + + body := readFile(t, bodyPath) + sig := readFile(t, sigPath) + pub := readFile(t, pubPath) + + if err := VerifyBlobSignatureWithKey(body, sig, string(pub)); err != nil { + t.Fatalf("VerifyBlobSignatureWithKey: %v", err) + } +} + +func readFile(t *testing.T, p string) []byte { + t.Helper() + out, err := exec.Command("cat", p).Output() + if err != nil { + t.Fatalf("read %s: %v", p, err) + } + return out +} diff --git a/scripts/smoke.sh b/scripts/smoke.sh index 0df7744..4b2a7cc 100644 --- a/scripts/smoke.sh +++ b/scripts/smoke.sh @@ -80,6 +80,13 @@ SMOKE_SCENARIOS=( nat invalid_spec invalid_name + update_check + update_to_unknown + update_no_root + update_dry_run + update_keeps_vm_alive + update_rollback_keeps_vm_alive + daemon_admin ) declare -A SMOKE_DESCS=( @@ -104,6 +111,13 @@ declare -A SMOKE_DESCS=( [nat]="--nat installs per-VM MASQUERADE; control VM does not" [invalid_spec]="--vcpu 0 rejected, no VM row leaked" [invalid_name]="bad names (uppercase/space/dot/leading-hyphen) all rejected" + [update_check]="update --check reports update-available against fake manifest" + 
[update_to_unknown]="update --to v9.9.9 fails before any host mutation" + [update_no_root]="update without sudo refuses with a root-required error" + [update_dry_run]="update --dry-run fetches + verifies but does not swap" + [update_keeps_vm_alive]="update v0.smoke.0: VM SSH survives the daemon restart, install.toml + version flip" + [update_rollback_keeps_vm_alive]="rollback drill: broken-bangerd release fails to start, Rollback fires, binary reverts, VM SSH survives" + [daemon_admin]="daemon socket prints sock path; --check-migrations reports compatible; daemon stop tears services down" ) declare -A SMOKE_CLASS=( @@ -128,6 +142,13 @@ declare -A SMOKE_CLASS=( [nat]=global [invalid_spec]=global [invalid_name]=global + [update_check]=global + [update_to_unknown]=global + [update_no_root]=global + [update_dry_run]=global + [update_keeps_vm_alive]=global + [update_rollback_keeps_vm_alive]=global + [daemon_admin]=global ) usage() { @@ -306,15 +327,24 @@ sudo_banger() { sudo env GOCOVERDIR="$BANGER_SMOKE_COVER_DIR" "$@" } +cleanup_release_server() { + if [[ -n "${RELEASE_HTTP_PID:-}" ]] && kill -0 "$RELEASE_HTTP_PID" 2>/dev/null; then + kill "$RELEASE_HTTP_PID" 2>/dev/null || true + wait "$RELEASE_HTTP_PID" 2>/dev/null || true + fi +} + cleanup() { set +e for vm in \ smoke-lifecycle smoke-set smoke-restart smoke-kill smoke-ports smoke-fc \ - smoke-basecommit smoke-exec smoke-wsrestart smoke-nat smoke-nocnat; do + smoke-basecommit smoke-exec smoke-wsrestart smoke-nat smoke-nocnat \ + smoke-update smoke-rollback; do "$BANGER" vm delete "$vm" >/dev/null 2>&1 || true done cleanup_export_vm cleanup_prune + cleanup_release_server stop_services_for_coverage collect_service_coverage sudo_banger "$BANGER" system uninstall --purge >/dev/null 2>&1 || true @@ -885,6 +915,384 @@ scenario_invalid_name() { || die "invalid name leaked VM row(s): pre=$pre_vms, post=$post_vms" } +# --------------------------------------------------------------------- +# Update flow: locally-built release 
artefacts + a backgrounded HTTP +# server stand in for the real Cloudflare R2 bucket. The hidden +# --manifest-url and --pubkey-file flags on `banger update` redirect +# the updater at this fake bucket. Production binaries reject anything +# that isn't signed by the embedded cosign key, so smoke generates a +# fresh ECDSA keypair and points the updater at the matching pub key. +# --------------------------------------------------------------------- + +# Tracks whether prepare_smoke_releases has run so per-scenario calls +# are cheap and idempotent on the second hit (full suite invokes them in +# sequence; --scenario filtering may skip ahead). +SMOKE_RELEASES_READY=0 +RELEASE_HTTP_PID= +RELEASE_PORT= +MANIFEST_URL= +PUBKEY_FILE= + +prepare_smoke_releases() { + if (( SMOKE_RELEASES_READY == 1 )); then return 0; fi + + local rel_dir="$scratch_root/release" + rm -rf "$rel_dir" && mkdir -p "$rel_dir" + + # Generate ECDSA P-256 keypair (cosign blob signatures are an ASN.1 + # ECDSA signature over SHA256(body); openssl produces the same + # encoding via `openssl dgst -sha256 -sign`). + command -v openssl >/dev/null 2>&1 || die 'update scenarios need openssl' + command -v python3 >/dev/null 2>&1 || die 'update scenarios need python3' + openssl ecparam -name prime256v1 -genkey -noout -out "$rel_dir/cosign.key" 2>/dev/null \ + || die 'openssl: keypair generation failed' + openssl ec -in "$rel_dir/cosign.key" -pubout -out "$rel_dir/cosign.pub" 2>/dev/null \ + || die 'openssl: public key extraction failed' + PUBKEY_FILE="$rel_dir/cosign.pub" + + build_smoke_release_tarball "$rel_dir" v0.smoke.0 + build_smoke_release_tarball "$rel_dir" v0.smoke.broken-bangerd + + # Background a tiny HTTP server. Port 0 lets the kernel pick a free + # port; the python harness prints the chosen port on stdout so we + # can compose the manifest URLs once we know it. 
+ local port_file="$rel_dir/.port" + : >"$port_file" + python3 -u -c " +import http.server, socketserver, sys, os +os.chdir(sys.argv[1]) +class H(http.server.SimpleHTTPRequestHandler): + def log_message(self, *a, **kw): pass +with socketserver.TCPServer(('127.0.0.1', 0), H) as srv: + sys.stdout.write(str(srv.server_address[1]) + '\n'); sys.stdout.flush() + srv.serve_forever() +" "$rel_dir" >"$port_file" 2>/dev/null & + RELEASE_HTTP_PID=$! + local i + for i in $(seq 1 50); do + [[ -s "$port_file" ]] && break + sleep 0.1 + done + RELEASE_PORT="$(head -n1 "$port_file")" + [[ -n "$RELEASE_PORT" ]] || die 'release HTTP server did not announce a port' + MANIFEST_URL="http://127.0.0.1:$RELEASE_PORT/manifest.json" + + write_smoke_manifest "$rel_dir/manifest.json" "http://127.0.0.1:$RELEASE_PORT" + SMOKE_RELEASES_READY=1 + log "release server ready at $MANIFEST_URL" +} + +# Builds banger / bangerd / banger-vsock-agent under -ldflags pointing +# Version at $version, tarballs them, writes a sha256sums file, and +# signs it with the smoke release key. Output: +# $rel_dir/$version/banger-$version-linux-amd64.tar.gz +# $rel_dir/$version/SHA256SUMS +# $rel_dir/$version/SHA256SUMS.sig +build_smoke_release_tarball() { + local rel_dir="$1" + local version="$2" + local out_dir="$rel_dir/$version" + local stage="$out_dir/.stage" + mkdir -p "$stage" + + local ldflags="-X banger/internal/buildinfo.Version=$version -X banger/internal/buildinfo.Commit=smoke -X banger/internal/buildinfo.BuiltAt=2026-04-30T00:00:00Z" + ( cd "$(repo_root)" && go build -ldflags "$ldflags" -o "$stage/banger" ./cmd/banger ) \ + || die "build banger@$version failed" + if [[ "$version" == v0.smoke.broken-* ]]; then + # v0.smoke.broken-* is the rollback drill's intentionally-broken + # release: bangerd passes the pre-swap --check-migrations sanity + # (so the swap proceeds) but exits non-zero in service mode (so + # the post-swap `systemctl restart bangerd` fires runUpdate's + # rollbackAndRestart path). 
Shell script is enough — systemd's + # ExecStart= handles the shebang. + cat >"$stage/bangerd" <<'BROKEN' +#!/bin/sh +case "$*" in + *--check-migrations*) + printf 'compatible: smoke broken-bangerd pretends to be ready\n' + exit 0 + ;; + *) + printf 'smoke broken-bangerd: refusing to run as daemon\n' >&2 + exit 1 + ;; +esac +BROKEN + chmod 0755 "$stage/bangerd" + else + ( cd "$(repo_root)" && go build -ldflags "$ldflags" -o "$stage/bangerd" ./cmd/bangerd ) \ + || die "build bangerd@$version failed" + fi + ( cd "$(repo_root)" && CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags "$ldflags" -o "$stage/banger-vsock-agent" ./cmd/banger-vsock-agent ) \ + || die "build banger-vsock-agent@$version failed" + + local tarball_name="banger-$version-linux-amd64.tar.gz" + ( cd "$stage" && tar czf "$out_dir/$tarball_name" banger bangerd banger-vsock-agent ) \ + || die "tar $version failed" + + local hash + hash="$(sha256sum "$out_dir/$tarball_name" | awk '{print $1}')" + printf '%s %s\n' "$hash" "$tarball_name" >"$out_dir/SHA256SUMS" + + # cosign blob signature == base64(ECDSA-ASN.1 over SHA256(body)). + # `openssl dgst -sha256 -sign` produces the exact same encoding. + openssl dgst -sha256 -sign "$rel_dir/cosign.key" "$out_dir/SHA256SUMS" \ + | base64 -w0 >"$out_dir/SHA256SUMS.sig" || die "sign SHA256SUMS for $version failed" + + rm -rf "$stage" +} + +repo_root() { + # smoke.sh lives at $repo/scripts/smoke.sh; resolve the repo dir + # without depending on PWD or BASH_SOURCE-relative cwd at call time. + local script_dir + script_dir="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" + ( cd "$script_dir/.." 
&& pwd ) +} + +write_smoke_manifest() { + local path="$1" + local base="$2" + cat >"$path" </dev/null | awk '{print $2}' +} + +scenario_update_check() { + log "${SMOKE_DESCS[update_check]}" + prepare_smoke_releases + local out + out="$("$BANGER" update --check \ + --manifest-url "$MANIFEST_URL" --pubkey-file "$PUBKEY_FILE" 2>&1)" \ + || die "update --check failed: $out" + grep -q 'update available: ' <<<"$out" \ + || die "update --check stdout missing 'update available:' line; got: $out" +} + +scenario_update_to_unknown() { + log "${SMOKE_DESCS[update_to_unknown]}" + prepare_smoke_releases + local pre_ver post_ver out rc + pre_ver="$(installed_version)" + set +e + out="$("$BANGER" update --to v9.9.9 \ + --manifest-url "$MANIFEST_URL" --pubkey-file "$PUBKEY_FILE" 2>&1)" + rc=$? + set -e + [[ "$rc" -ne 0 ]] || die "update --to v9.9.9: exit 0 (out: $out)" + grep -qi 'not found' <<<"$out" \ + || die "update --to v9.9.9: error doesn't say 'not found'; got: $out" + post_ver="$(installed_version)" + [[ "$pre_ver" == "$post_ver" ]] \ + || die "update --to v9.9.9 mutated the install: $pre_ver -> $post_ver" +} + +scenario_update_no_root() { + log "${SMOKE_DESCS[update_no_root]}" + prepare_smoke_releases + local pre_ver post_ver out rc + pre_ver="$(installed_version)" + set +e + out="$("$BANGER" update --to v0.smoke.0 \ + --manifest-url "$MANIFEST_URL" --pubkey-file "$PUBKEY_FILE" 2>&1)" + rc=$? + set -e + [[ "$rc" -ne 0 ]] || die "update without sudo: exit 0 (out: $out)" + grep -qi 'root' <<<"$out" \ + || die "update without sudo: error doesn't mention root; got: $out" + post_ver="$(installed_version)" + [[ "$pre_ver" == "$post_ver" ]] \ + || die "update without sudo mutated the install: $pre_ver -> $post_ver" +} + +scenario_update_dry_run() { + log "${SMOKE_DESCS[update_dry_run]}" + prepare_smoke_releases + if ! 
sudo -n true 2>/dev/null; then
+    log 'update_dry_run: passwordless sudo unavailable; skipping'
+    return 0
+  fi
+  local pre_ver post_ver out
+  pre_ver="$(installed_version)"
+  out="$(sudo_banger "$BANGER" update --to v0.smoke.0 --dry-run \
+    --manifest-url "$MANIFEST_URL" --pubkey-file "$PUBKEY_FILE" 2>&1)" \
+    || die "update --dry-run failed: $out"
+  grep -q 'dry-run:' <<<"$out" \
+    || die "update --dry-run stdout missing 'dry-run:' marker; got: $out"
+  post_ver="$(installed_version)"
+  [[ "$pre_ver" == "$post_ver" ]] \
+    || die "update --dry-run swapped the binary: $pre_ver -> $post_ver"
+}
+
+# vm_boot_id reads /proc/sys/kernel/random/boot_id from inside the
+# given guest. That value is regenerated by the kernel on every boot,
+# so it's a clean way to assert "the VM did NOT reboot" — daemon
+# restart does not touch the running firecracker process, so a guest
+# kernel that survives the daemon restart returns the same boot_id.
+#
+# Arguments: $1 - VM name.
+# Outputs:   the guest's boot_id on stdout; stderr is discarded.
+# Returns:   the exit status of `banger vm ssh`, so callers that capture
+#            the output must tolerate a non-zero status themselves.
+vm_boot_id() {
+  "$BANGER" vm ssh "$1" -- cat /proc/sys/kernel/random/boot_id 2>/dev/null
+}
+
+# scenario_update_keeps_vm_alive performs a real `banger update` to
+# v0.smoke.0 while a VM is running and asserts that:
+#   - the installed binary version changed to v0.smoke.0,
+#   - /etc/banger/install.toml records v0.smoke.0,
+#   - the VM still answers SSH and reports the same boot_id as before.
+# Skips (returns 0) when passwordless sudo is unavailable.
+scenario_update_keeps_vm_alive() {
+  log "${SMOKE_DESCS[update_keeps_vm_alive]}"
+  prepare_smoke_releases
+  if ! sudo -n true 2>/dev/null; then
+    log 'update_keeps_vm_alive: passwordless sudo unavailable; skipping'
+    return 0
+  fi
+
+  "$BANGER" vm create --name smoke-update >/dev/null \
+    || die 'create smoke-update failed'
+  wait_for_ssh smoke-update || die 'smoke-update unreachable pre-update'
+  local pre_boot post_boot pre_ver post_ver
+  # A failed guest SSH must fall through to the explicit die below;
+  # a bare assignment would abort silently under errexit.
+  pre_boot="$(vm_boot_id smoke-update)" || pre_boot=''
+  [[ -n "$pre_boot" ]] || die 'pre-update boot_id capture failed'
+  pre_ver="$(installed_version)"
+
+  sudo_banger "$BANGER" update --to v0.smoke.0 \
+    --manifest-url "$MANIFEST_URL" --pubkey-file "$PUBKEY_FILE" >/dev/null \
+    || die 'update --to v0.smoke.0 failed'
+
+  post_ver="$(installed_version)"
+  [[ "$post_ver" == "v0.smoke.0" ]] \
+    || die "post-update /usr/local/bin/banger version = $post_ver, want v0.smoke.0"
+  [[ "$pre_ver" != "$post_ver" ]] \
+    || die "update did not change the binary version (pre==post=$post_ver)"
+
+  local meta_ver
+  # `|| true`: a missing version line (grep exit 1, fatal with pipefail)
+  # should surface through the comparison below, not as a silent exit.
+  meta_ver="$(sudo grep -E '^version[[:space:]]*=' /etc/banger/install.toml \
+    | sed -E 's/.*"([^"]+)".*/\1/')" || true
+  [[ "$meta_ver" == "v0.smoke.0" ]] \
+    || die "install.toml version = '$meta_ver', want v0.smoke.0"
+
+  if ! wait_for_ssh smoke-update; then
+    # Best-effort diagnostics; every probe is allowed to fail.
+    log 'smoke-update unreachable AFTER update; dumping diagnostics:'
+    "$BANGER" vm show smoke-update 2>&1 | sed 's/^/ show: /' >&2 || true
+    pgrep -af firecracker | sed 's/^/ fc-procs: /' >&2 || true
+    sudo grep -E 'KillMode|SendSIGKILL' /etc/systemd/system/bangerd-root.service 2>&1 | sed 's/^/ unit: /' >&2 || true
+    systemctl show bangerd-root.service --property=KillMode,SendSIGKILL,FinalKillSignal 2>&1 | sed 's/^/ unit-prop: /' >&2 || true
+    sudo journalctl -u bangerd.service -u bangerd-root.service --since '120 seconds ago' --no-pager 2>&1 | tail -40 | sed 's/^/ journal: /' >&2 || true
+    die 'smoke-update unreachable AFTER update — daemon restart likely killed VM'
+  fi
+  post_boot="$(vm_boot_id smoke-update)" || post_boot=''
+  [[ -n "$post_boot" ]] || die 'post-update boot_id read failed'
+  [[ "$pre_boot" == "$post_boot" ]] \
+    || die "VM rebooted during update: boot_id $pre_boot -> $post_boot"
+
+  "$BANGER" vm delete smoke-update >/dev/null 2>&1 || true
+}
+
+# scenario_update_rollback_keeps_vm_alive runs `banger update` against a
+# deliberately broken release while a VM is running and asserts that:
+#   - the update exits non-zero,
+#   - the installed version is back to the pre-update version,
+#   - the VM still answers SSH and reports the same boot_id as before.
+# The updater's combined output is written to
+# $scratch_root/rollback-update.log (scratch_root is set outside this
+# function). Skips (returns 0) when passwordless sudo is unavailable.
+scenario_update_rollback_keeps_vm_alive() {
+  log "${SMOKE_DESCS[update_rollback_keeps_vm_alive]}"
+  prepare_smoke_releases
+  if ! sudo -n true 2>/dev/null; then
+    log 'update_rollback_keeps_vm_alive: passwordless sudo unavailable; skipping'
+    return 0
+  fi
+  # The v0.smoke.broken-bangerd release ships a bangerd that passes
+  # the pre-swap --check-migrations sanity (so the swap proceeds) but
+  # exits non-zero when systemd starts it as the daemon. That trips
+  # runUpdate's `restart bangerd` step: rollbackAndRestart runs, the
+  # previous binaries are restored from .previous, and the helper +
+  # daemon are re-restarted onto the prior install.
+  local pre_ver
+  pre_ver="$(installed_version)"
+
+  "$BANGER" vm create --name smoke-rollback >/dev/null \
+    || die 'create smoke-rollback failed'
+  wait_for_ssh smoke-rollback || die 'smoke-rollback unreachable pre-drill'
+  local pre_boot post_boot
+  # See scenario comment: failure must reach the die, not trip errexit.
+  pre_boot="$(vm_boot_id smoke-rollback)" || pre_boot=''
+  [[ -n "$pre_boot" ]] || die 'pre-drill boot_id capture failed'
+
+  # The update is expected to fail. Capture its status with `|| rc=$?`
+  # rather than toggling `set +e` / `set -e`, so the caller's shell
+  # options are left exactly as they were.
+  local rc=0 upd_log
+  upd_log="$scratch_root/rollback-update.log"
+  sudo_banger "$BANGER" update --to v0.smoke.broken-bangerd \
+    --manifest-url "$MANIFEST_URL" --pubkey-file "$PUBKEY_FILE" >"$upd_log" 2>&1 \
+    || rc=$?
+
+  if [[ "$rc" -eq 0 ]]; then
+    log 'rollback drill: update returned exit 0 despite broken bangerd'
+    sed 's/^/ upd: /' "$upd_log" >&2 || true
+    die 'rollback drill: expected non-zero exit'
+  fi
+
+  # Rollback should have restored the binaries to whatever was running
+  # pre-update.
+  local post_ver
+  post_ver="$(installed_version)"
+  [[ "$post_ver" == "$pre_ver" ]] \
+    || die "rollback drill: post-rollback version = $post_ver, want $pre_ver"
+
+  if ! wait_for_ssh smoke-rollback; then
+    # The updater log is the most direct evidence of what the rollback
+    # did; show it before failing (best-effort).
+    sed 's/^/ upd: /' "$upd_log" >&2 || true
+    die 'smoke-rollback unreachable AFTER rollback — VM did not survive'
+  fi
+  post_boot="$(vm_boot_id smoke-rollback)" || post_boot=''
+  [[ -n "$post_boot" ]] || die 'post-rollback boot_id read failed'
+  [[ "$pre_boot" == "$post_boot" ]] \
+    || die "VM rebooted during rollback drill: boot_id $pre_boot -> $post_boot"
+
+  "$BANGER" vm delete smoke-rollback >/dev/null 2>&1 || true
+}
+
+# daemon_admin must be the LAST scenario in the registry: `banger daemon
+# stop` tears the installed services down, so anything after it that
+# touches the daemon would fail. Cleanup re-stops idempotently and the
+# uninstall path doesn't need active services.
+scenario_daemon_admin() {
+  # Three checks, in order:
+  #   1. `banger daemon socket` prints the expected socket path.
+  #   2. `bangerd --system --check-migrations` succeeds and emits a
+  #      line starting with `compatible:`.
+  #   3. With passwordless sudo only: `banger daemon stop` succeeds and
+  #      `banger system status` then reports both units inactive.
+  local sock_path migration_report status_report
+  local expected_sock='/run/banger/bangerd.sock'
+
+  log "${SMOKE_DESCS[daemon_admin]}"
+
+  if ! sock_path="$("$BANGER" daemon socket)"; then
+    die 'daemon socket: command failed'
+  fi
+  if [[ "$sock_path" != "$expected_sock" ]]; then
+    die "daemon socket: got '$sock_path', want '$expected_sock'"
+  fi
+
+  if ! migration_report="$("$BANGERD" --system --check-migrations)"; then
+    die "bangerd --check-migrations: non-zero exit (out: $migration_report)"
+  fi
+  if ! grep -q '^compatible:' <<<"$migration_report"; then
+    die "bangerd --check-migrations: stdout missing 'compatible:' prefix; got: $migration_report"
+  fi
+
+  # Stopping the services needs root; without it the first two checks
+  # are all this scenario covers.
+  sudo -n true 2>/dev/null || {
+    log 'daemon_admin: passwordless sudo unavailable; skipping daemon stop assertion'
+    return 0
+  }
+
+  if ! sudo_banger "$BANGER" daemon stop >/dev/null; then
+    die 'banger daemon stop: command failed'
+  fi
+
+  # The exit status of `system status` is ignored on purpose; only the
+  # reported unit states are asserted.
+  status_report="$("$BANGER" system status 2>/dev/null || true)"
+  if ! grep -qE '^active +inactive' <<<"$status_report"; then
+    die "owner daemon still active after daemon stop: $status_report"
+  fi
+  if ! grep -qE '^helper_active +inactive' <<<"$status_report"; then
+    die "root helper still active after daemon stop: $status_report"
+  fi
+}
+
 # ---------------------------------------------------------------------
 # Dispatchers.
 # ---------------------------------------------------------------------