update: VMs survive banger update and rollback
Three load-bearing fixes that together let `banger update` (and its auto-rollback path) restart the helper + daemon without killing every running VM. New smoke scenarios prove the property end-to-end. Bug fixes: 1. Disable the firecracker SDK's signal-forwarding goroutine. The default ForwardSignals = [SIGINT, SIGQUIT, SIGTERM, SIGHUP, SIGABRT] installs a handler in the helper that propagates the helper's SIGTERM (sent by systemd on `systemctl stop bangerd-root.service`) to every running firecracker child. Set ForwardSignals to an empty (non-nil) slice so setupSignals short-circuits at len()==0. 2. Add SendSIGKILL=no to bangerd-root.service. KillMode=process limits the initial SIGTERM to the helper main, but systemd still SIGKILLs leftover cgroup processes during the FinalKillSignal stage unless SendSIGKILL=no. 3. Route restart-helper / restart-daemon / wait-daemon-ready failures through rollbackAndRestart instead of rollbackAndWrap. rollbackAndWrap restored .previous binaries but didn't re-restart the failed unit, leaving the helper dead with the rolled-back binary on disk after a failed update. Testing infrastructure (production binaries unaffected): - Hidden --manifest-url and --pubkey-file flags on `banger update` let the smoke harness redirect the updater at locally-built release artefacts. Marked Hidden in cobra; not advertised in --help. - FetchManifestFrom / VerifyBlobSignatureWithKey / FetchAndVerifySignatureWithKey export the existing logic against caller-supplied URL / pubkey. The default entry points still call them with the embedded canonical values.
Smoke scenarios: - update_check: --check against fake manifest reports update available - update_to_unknown: --to v9.9.9 fails before any host mutation - update_no_root: refuses without sudo, install untouched - update_dry_run: stages + verifies, no swap, version unchanged - update_keeps_vm_alive: real swap to v0.smoke.0; same VM (same boot_id) answers SSH after the daemon restart - update_rollback_keeps_vm_alive: v0.smoke.broken-bangerd ships a bangerd that passes --check-migrations but exits 1 as the daemon. The post-swap `systemctl restart bangerd` fails, rollbackAndRestart fires, the .previous binaries are restored and re-restarted; the same VM still answers SSH afterwards - daemon_admin (separate prep): covers `banger daemon socket`, `bangerd --check-migrations --system`, `sudo banger daemon stop` The smoke release builder generates a fresh ECDSA P-256 keypair with openssl, signs SHA256SUMS cosign-compatibly, and serves artefacts from a backgrounded python http.server. verify_smoke_check_test.go pins the openssl/cosign signature equivalence so the smoke release builder can't silently drift. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
7e528f30b3
commit
2606bfbabb
8 changed files with 609 additions and 50 deletions
|
|
@ -30,10 +30,12 @@ const stagingTarballName = "release.tar.gz"
|
|||
|
||||
func (d *deps) newUpdateCommand() *cobra.Command {
|
||||
var (
|
||||
checkOnly bool
|
||||
dryRun bool
|
||||
force bool
|
||||
toVersion string
|
||||
checkOnly bool
|
||||
dryRun bool
|
||||
force bool
|
||||
toVersion string
|
||||
manifestURL string
|
||||
pubkeyFile string
|
||||
)
|
||||
cmd := &cobra.Command{
|
||||
Use: "update",
|
||||
|
|
@ -68,10 +70,12 @@ talks to systemd. Run with sudo.
|
|||
Args: noArgsUsage("usage: banger update [--check] [--dry-run] [--force] [--to vX.Y.Z]"),
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
return d.runUpdate(cmd, runUpdateOpts{
|
||||
checkOnly: checkOnly,
|
||||
dryRun: dryRun,
|
||||
force: force,
|
||||
toVersion: toVersion,
|
||||
checkOnly: checkOnly,
|
||||
dryRun: dryRun,
|
||||
force: force,
|
||||
toVersion: toVersion,
|
||||
manifestURL: manifestURL,
|
||||
pubkeyFile: pubkeyFile,
|
||||
})
|
||||
},
|
||||
}
|
||||
|
|
@ -79,23 +83,53 @@ talks to systemd. Run with sudo.
|
|||
cmd.Flags().BoolVar(&dryRun, "dry-run", false, "fetch and verify, but do not swap or restart anything")
|
||||
cmd.Flags().BoolVar(&force, "force", false, "skip in-flight-op refusal and post-restart doctor verification")
|
||||
cmd.Flags().StringVar(&toVersion, "to", "", "specific release version to install (default: latest_stable from manifest)")
|
||||
// Hidden test/dev hooks: redirect the updater at a non-default
|
||||
// manifest URL and trust a non-default cosign public key. Used by
|
||||
// the smoke suite to drive a real update against locally-built
|
||||
// release artefacts. Production users have no reason to touch
|
||||
// these; they are not advertised in --help.
|
||||
cmd.Flags().StringVar(&manifestURL, "manifest-url", "", "")
|
||||
cmd.Flags().StringVar(&pubkeyFile, "pubkey-file", "", "")
|
||||
_ = cmd.Flags().MarkHidden("manifest-url")
|
||||
_ = cmd.Flags().MarkHidden("pubkey-file")
|
||||
return cmd
|
||||
}
|
||||
|
||||
type runUpdateOpts struct {
|
||||
checkOnly bool
|
||||
dryRun bool
|
||||
force bool
|
||||
toVersion string
|
||||
checkOnly bool
|
||||
dryRun bool
|
||||
force bool
|
||||
toVersion string
|
||||
manifestURL string
|
||||
pubkeyFile string
|
||||
}
|
||||
|
||||
func (d *deps) runUpdate(cmd *cobra.Command, opts runUpdateOpts) error {
|
||||
ctx := cmd.Context()
|
||||
out := cmd.OutOrStdout()
|
||||
|
||||
// Resolve the test/dev override flags up front so a bad
|
||||
// --pubkey-file fails fast before any network round-trips.
|
||||
pubKeyPEM := updater.BangerReleasePublicKey
|
||||
if strings.TrimSpace(opts.pubkeyFile) != "" {
|
||||
body, err := os.ReadFile(opts.pubkeyFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read --pubkey-file: %w", err)
|
||||
}
|
||||
pubKeyPEM = string(body)
|
||||
}
|
||||
|
||||
// Discover.
|
||||
client := &http.Client{Timeout: 30 * time.Second}
|
||||
manifest, err := updater.FetchManifest(ctx, client)
|
||||
var (
|
||||
manifest updater.Manifest
|
||||
err error
|
||||
)
|
||||
if strings.TrimSpace(opts.manifestURL) != "" {
|
||||
manifest, err = updater.FetchManifestFrom(ctx, client, opts.manifestURL)
|
||||
} else {
|
||||
manifest, err = updater.FetchManifest(ctx, client)
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("discover: %w", err)
|
||||
}
|
||||
|
|
@ -142,7 +176,7 @@ func (d *deps) runUpdate(cmd *cobra.Command, opts runUpdateOpts) error {
|
|||
if err != nil {
|
||||
return fmt.Errorf("download: %w", err)
|
||||
}
|
||||
if err := updater.FetchAndVerifySignature(ctx, client, target, sumsBody); err != nil {
|
||||
if err := updater.FetchAndVerifySignatureWithKey(ctx, client, target, sumsBody, pubKeyPEM); err != nil {
|
||||
// Don't leave the staged tarball around — it failed
|
||||
// signature verification and shouldn't be re-runnable.
|
||||
_ = os.Remove(tarballPath)
|
||||
|
|
@ -179,15 +213,21 @@ func (d *deps) runUpdate(cmd *cobra.Command, opts runUpdateOpts) error {
|
|||
return fmt.Errorf("swap: %w (rolled back)", err)
|
||||
}
|
||||
|
||||
// Restart services + wait for the new daemon.
|
||||
// Restart services + wait for the new daemon. A `systemctl restart`
|
||||
// that fails has typically already STOPPED the unit, so the prior
|
||||
// binary on disk isn't running anywhere — Rollback() must be paired
|
||||
// with a re-restart to bring the rolled-back binary back into a
|
||||
// running state. That's rollbackAndRestart's job; rollbackAndWrap
|
||||
// is for the swap-step failures earlier where the restart never
|
||||
// fired and the old binary is still in memory.
|
||||
if err := d.runSystemctl(ctx, "restart", installmeta.DefaultRootHelperService); err != nil {
|
||||
return rollbackAndWrap(swap, "restart helper", err)
|
||||
return rollbackAndRestart(ctx, d, swap, "restart helper", err)
|
||||
}
|
||||
if err := d.runSystemctl(ctx, "restart", installmeta.DefaultService); err != nil {
|
||||
return rollbackAndWrap(swap, "restart daemon", err)
|
||||
return rollbackAndRestart(ctx, d, swap, "restart daemon", err)
|
||||
}
|
||||
if err := d.waitForDaemonReady(ctx, socketPath); err != nil {
|
||||
return rollbackAndWrap(swap, "wait daemon ready", err)
|
||||
return rollbackAndRestart(ctx, d, swap, "wait daemon ready", err)
|
||||
}
|
||||
|
||||
// Verify with doctor unless --force says otherwise.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue