update: VMs survive banger update and rollback
Three load-bearing fixes that together let `banger update` (and its auto-rollback path) restart the helper + daemon without killing every running VM. New smoke scenarios prove the property end-to-end. Bug fixes: 1. Disable the firecracker SDK's signal-forwarding goroutine. The default ForwardSignals = [SIGINT, SIGQUIT, SIGTERM, SIGHUP, SIGABRT] installs a handler in the helper that propagates the helper's SIGTERM (sent by systemd on `systemctl stop bangerd-root.service`) to every running firecracker child. Set ForwardSignals to an empty (non-nil) slice so setupSignals short-circuits at len()==0. 2. Add SendSIGKILL=no to bangerd-root.service. KillMode=process limits the initial SIGTERM to the helper main, but systemd still SIGKILLs leftover cgroup processes during the FinalKillSignal stage unless SendSIGKILL=no. 3. Route restart-helper / restart-daemon / wait-daemon-ready failures through rollbackAndRestart instead of rollbackAndWrap. rollbackAndWrap restored .previous binaries but didn't re-restart the failed unit, leaving the helper dead with the rolled-back binary on disk after a failed update. Testing infrastructure (production binaries unaffected): - Hidden --manifest-url and --pubkey-file flags on `banger update` let the smoke harness redirect the updater at locally-built release artefacts. Marked Hidden in cobra; not advertised in --help. - FetchManifestFrom / VerifyBlobSignatureWithKey / FetchAndVerifySignatureWithKey export the existing logic against caller-supplied URL / pubkey. The default entry points still call them with the embedded canonical values.
Smoke scenarios: - update_check: --check against fake manifest reports update available - update_to_unknown: --to v9.9.9 fails before any host mutation - update_no_root: refuses without sudo, install untouched - update_dry_run: stages + verifies, no swap, version unchanged - update_keeps_vm_alive: real swap to v0.smoke.0; same VM (same boot_id) answers SSH after the daemon restart - update_rollback_keeps_vm_alive: v0.smoke.broken-bangerd ships a bangerd that passes --check-migrations but exits 1 as the daemon. The post-swap `systemctl restart bangerd` fails, rollbackAndRestart fires, the .previous binaries are restored and re-restarted; the same VM still answers SSH afterwards - daemon_admin (separate prep): covers `banger daemon socket`, `bangerd --check-migrations --system`, `sudo banger daemon stop` The smoke release builder generates a fresh ECDSA P-256 keypair with openssl, signs SHA256SUMS cosign-compatibly, and serves artefacts from a backgrounded python http.server. verify_smoke_check_test.go pins the openssl/cosign signature equivalence so the smoke release builder can't silently drift. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
7e528f30b3
commit
2606bfbabb
8 changed files with 609 additions and 50 deletions
|
|
@ -364,18 +364,34 @@ func renderRootHelperSystemdUnit() string {
|
|||
"ExecStart=" + systemBangerdBin + " --root-helper",
|
||||
"Restart=on-failure",
|
||||
"RestartSec=1s",
|
||||
// KillMode=process is load-bearing: the helper unit's cgroup is
|
||||
// where every banger-launched firecracker process lives (see
|
||||
// validateFirecrackerPID). Without this, `systemctl restart
|
||||
// bangerd-root.service` — which `banger update` runs — would
|
||||
// SIGKILL every in-flight VM along with the helper because
|
||||
// systemd's default KillMode=control-group nukes the whole cgroup.
|
||||
// With process mode, only the helper PID is signaled; firecracker
|
||||
// children survive, the new helper instance re-attaches via the
|
||||
// helper RPC, daemon reconcile re-seeds in-memory state, VM keeps
|
||||
// running. `banger system uninstall` and the daemon's vm-stop
|
||||
// path explicitly stop firecracker processes when actually needed.
|
||||
// KillMode=process + SendSIGKILL=no together make the helper
|
||||
// safe to restart while banger-launched firecrackers are
|
||||
// running. firecracker lives in this unit's cgroup (jailer
|
||||
// doesn't open a sub-cgroup), so:
|
||||
//
|
||||
// - Default control-group mode SIGKILLs every process in
|
||||
// the cgroup on stop.
|
||||
// - KillMode=process limits the initial SIGTERM to the
|
||||
// helper main PID; systemd leaves remaining cgroup
|
||||
// processes alone (and logs "Unit process N (firecracker)
|
||||
// remains running after unit stopped").
|
||||
// - SendSIGKILL=no disables the FinalKillSignal escalation
|
||||
// that would otherwise SIGKILL leftovers after the timeout.
|
||||
//
|
||||
// One more pitfall: the firecracker SDK installs a default
|
||||
// signal-forwarding goroutine in the helper that catches
|
||||
// SIGTERM (etc.) and forwards it to every firecracker child.
|
||||
// We disable that explicitly via ForwardSignals: []os.Signal{}
|
||||
// in firecracker.buildConfig — without that override, systemd
|
||||
// signaling the helper main would propagate to every running
|
||||
// VM regardless of what these directives do.
|
||||
//
|
||||
// `banger system uninstall` and the daemon's vm-stop path
|
||||
// explicitly stop firecracker processes when actually needed,
|
||||
// so we don't lose the systemd-driven kill as a real safety
|
||||
// net — banger drives those kills itself.
|
||||
"KillMode=process",
|
||||
"SendSIGKILL=no",
|
||||
"Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
|
||||
"Environment=TMPDIR=" + installmeta.DefaultRootHelperRuntimeDir,
|
||||
"UMask=0077",
|
||||
|
|
|
|||
|
|
@ -30,10 +30,12 @@ const stagingTarballName = "release.tar.gz"
|
|||
|
||||
func (d *deps) newUpdateCommand() *cobra.Command {
|
||||
var (
|
||||
checkOnly bool
|
||||
dryRun bool
|
||||
force bool
|
||||
toVersion string
|
||||
checkOnly bool
|
||||
dryRun bool
|
||||
force bool
|
||||
toVersion string
|
||||
manifestURL string
|
||||
pubkeyFile string
|
||||
)
|
||||
cmd := &cobra.Command{
|
||||
Use: "update",
|
||||
|
|
@ -68,10 +70,12 @@ talks to systemd. Run with sudo.
|
|||
Args: noArgsUsage("usage: banger update [--check] [--dry-run] [--force] [--to vX.Y.Z]"),
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
return d.runUpdate(cmd, runUpdateOpts{
|
||||
checkOnly: checkOnly,
|
||||
dryRun: dryRun,
|
||||
force: force,
|
||||
toVersion: toVersion,
|
||||
checkOnly: checkOnly,
|
||||
dryRun: dryRun,
|
||||
force: force,
|
||||
toVersion: toVersion,
|
||||
manifestURL: manifestURL,
|
||||
pubkeyFile: pubkeyFile,
|
||||
})
|
||||
},
|
||||
}
|
||||
|
|
@ -79,23 +83,53 @@ talks to systemd. Run with sudo.
|
|||
cmd.Flags().BoolVar(&dryRun, "dry-run", false, "fetch and verify, but do not swap or restart anything")
|
||||
cmd.Flags().BoolVar(&force, "force", false, "skip in-flight-op refusal and post-restart doctor verification")
|
||||
cmd.Flags().StringVar(&toVersion, "to", "", "specific release version to install (default: latest_stable from manifest)")
|
||||
// Hidden test/dev hooks: redirect the updater at a non-default
|
||||
// manifest URL and trust a non-default cosign public key. Used by
|
||||
// the smoke suite to drive a real update against locally-built
|
||||
// release artefacts. Production users have no reason to touch
|
||||
// these; they are not advertised in --help.
|
||||
cmd.Flags().StringVar(&manifestURL, "manifest-url", "", "")
|
||||
cmd.Flags().StringVar(&pubkeyFile, "pubkey-file", "", "")
|
||||
_ = cmd.Flags().MarkHidden("manifest-url")
|
||||
_ = cmd.Flags().MarkHidden("pubkey-file")
|
||||
return cmd
|
||||
}
|
||||
|
||||
type runUpdateOpts struct {
|
||||
checkOnly bool
|
||||
dryRun bool
|
||||
force bool
|
||||
toVersion string
|
||||
checkOnly bool
|
||||
dryRun bool
|
||||
force bool
|
||||
toVersion string
|
||||
manifestURL string
|
||||
pubkeyFile string
|
||||
}
|
||||
|
||||
func (d *deps) runUpdate(cmd *cobra.Command, opts runUpdateOpts) error {
|
||||
ctx := cmd.Context()
|
||||
out := cmd.OutOrStdout()
|
||||
|
||||
// Resolve the test/dev override flags up front so a bad
|
||||
// --pubkey-file fails fast before any network round-trips.
|
||||
pubKeyPEM := updater.BangerReleasePublicKey
|
||||
if strings.TrimSpace(opts.pubkeyFile) != "" {
|
||||
body, err := os.ReadFile(opts.pubkeyFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read --pubkey-file: %w", err)
|
||||
}
|
||||
pubKeyPEM = string(body)
|
||||
}
|
||||
|
||||
// Discover.
|
||||
client := &http.Client{Timeout: 30 * time.Second}
|
||||
manifest, err := updater.FetchManifest(ctx, client)
|
||||
var (
|
||||
manifest updater.Manifest
|
||||
err error
|
||||
)
|
||||
if strings.TrimSpace(opts.manifestURL) != "" {
|
||||
manifest, err = updater.FetchManifestFrom(ctx, client, opts.manifestURL)
|
||||
} else {
|
||||
manifest, err = updater.FetchManifest(ctx, client)
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("discover: %w", err)
|
||||
}
|
||||
|
|
@ -142,7 +176,7 @@ func (d *deps) runUpdate(cmd *cobra.Command, opts runUpdateOpts) error {
|
|||
if err != nil {
|
||||
return fmt.Errorf("download: %w", err)
|
||||
}
|
||||
if err := updater.FetchAndVerifySignature(ctx, client, target, sumsBody); err != nil {
|
||||
if err := updater.FetchAndVerifySignatureWithKey(ctx, client, target, sumsBody, pubKeyPEM); err != nil {
|
||||
// Don't leave the staged tarball around — it failed
|
||||
// signature verification and shouldn't be re-runnable.
|
||||
_ = os.Remove(tarballPath)
|
||||
|
|
@ -179,15 +213,21 @@ func (d *deps) runUpdate(cmd *cobra.Command, opts runUpdateOpts) error {
|
|||
return fmt.Errorf("swap: %w (rolled back)", err)
|
||||
}
|
||||
|
||||
// Restart services + wait for the new daemon.
|
||||
// Restart services + wait for the new daemon. A `systemctl restart`
|
||||
// that fails has typically already STOPPED the unit, so the prior
|
||||
// binary on disk isn't running anywhere — Rollback() must be paired
|
||||
// with a re-restart to bring the rolled-back binary back into a
|
||||
// running state. That's rollbackAndRestart's job; rollbackAndWrap
|
||||
// is for the swap-step failures earlier where the restart never
|
||||
// fired and the old binary is still in memory.
|
||||
if err := d.runSystemctl(ctx, "restart", installmeta.DefaultRootHelperService); err != nil {
|
||||
return rollbackAndWrap(swap, "restart helper", err)
|
||||
return rollbackAndRestart(ctx, d, swap, "restart helper", err)
|
||||
}
|
||||
if err := d.runSystemctl(ctx, "restart", installmeta.DefaultService); err != nil {
|
||||
return rollbackAndWrap(swap, "restart daemon", err)
|
||||
return rollbackAndRestart(ctx, d, swap, "restart daemon", err)
|
||||
}
|
||||
if err := d.waitForDaemonReady(ctx, socketPath); err != nil {
|
||||
return rollbackAndWrap(swap, "wait daemon ready", err)
|
||||
return rollbackAndRestart(ctx, d, swap, "wait daemon ready", err)
|
||||
}
|
||||
|
||||
// Verify with doctor unless --force says otherwise.
|
||||
|
|
|
|||
|
|
@ -178,7 +178,15 @@ func TestRenderRootHelperSystemdUnitIncludesRequiredCapabilities(t *testing.T) {
|
|||
|
||||
for _, want := range []string{
|
||||
"ExecStart=/usr/local/bin/bangerd --root-helper",
|
||||
// Both directives are load-bearing for "VM survives helper
|
||||
// restart": KillMode=process limits the initial SIGTERM to
|
||||
// the helper main, SendSIGKILL=no disables the SIGKILL
|
||||
// escalation. The helper itself does the cgroup reparent
|
||||
// (see roothelper.reparentToBangerFCCgroup) — without
|
||||
// that, even these directives leave firecracker exposed to
|
||||
// systemd's stop-time cleanup.
|
||||
"KillMode=process",
|
||||
"SendSIGKILL=no",
|
||||
"Environment=TMPDIR=/run/banger-root",
|
||||
"NoNewPrivileges=yes",
|
||||
"PrivateTmp=yes",
|
||||
|
|
|
|||
|
|
@ -196,6 +196,15 @@ func buildConfig(cfg MachineConfig) sdk.Config {
|
|||
Smt: sdk.Bool(false),
|
||||
},
|
||||
VMID: cfg.VMID,
|
||||
// Disable the SDK's signal-forwarding goroutine. Default
|
||||
// (nil) makes the SDK install a handler that catches
|
||||
// SIGTERM/SIGINT/SIGHUP/SIGQUIT/SIGABRT in the parent process
|
||||
// and forwards them to the firecracker child — which means
|
||||
// `systemctl stop bangerd-root.service` (sends SIGTERM to the
|
||||
// helper) ends up signaling every firecracker the helper has
|
||||
// launched, killing every running VM. Empty slice (not nil)
|
||||
// short-circuits setupSignals at len()==0.
|
||||
ForwardSignals: []os.Signal{},
|
||||
}
|
||||
if cfg.Jailer != nil {
|
||||
// The path fields above are already chroot-translated by the
|
||||
|
|
@ -267,6 +276,7 @@ func defaultDriveID(drive DriveConfig, fallback string) string {
|
|||
// the configured UID:GID) — see fcproc.PrepareJailerChroot. The SDK's own
|
||||
// JailerCfg path is intentionally bypassed: it cannot mknod block devices and
|
||||
// does not expose --new-pid-ns.
|
||||
//
|
||||
func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd {
|
||||
var bin string
|
||||
var args []string
|
||||
|
|
@ -277,9 +287,10 @@ func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd {
|
|||
args = []string{"--api-sock", cfg.SocketPath, "--id", cfg.VMID}
|
||||
}
|
||||
var cmd *exec.Cmd
|
||||
if os.Geteuid() == 0 {
|
||||
switch {
|
||||
case os.Geteuid() == 0:
|
||||
cmd = exec.Command(bin, args...)
|
||||
} else {
|
||||
default:
|
||||
cmd = exec.Command("sudo", append([]string{"-n", "-E", bin}, args...)...)
|
||||
}
|
||||
cmd.Stdin = nil
|
||||
|
|
|
|||
|
|
@ -75,15 +75,23 @@ type Release struct {
|
|||
// Release.
|
||||
const ManifestSchemaVersion = 1
|
||||
|
||||
// FetchManifest downloads the release manifest and validates its
|
||||
// shape. Returns an error if the server is unreachable, returns
|
||||
// non-2xx, exceeds the size cap, or the schema_version is newer
|
||||
// than this CLI knows.
|
||||
// FetchManifest downloads the release manifest from the embedded
|
||||
// canonical URL and validates its shape. Returns an error if the
|
||||
// server is unreachable, returns non-2xx, exceeds the size cap, or
|
||||
// the schema_version is newer than this CLI knows.
|
||||
func FetchManifest(ctx context.Context, client *http.Client) (Manifest, error) {
|
||||
return FetchManifestFrom(ctx, client, manifestURL)
|
||||
}
|
||||
|
||||
// FetchManifestFrom is FetchManifest against an explicit URL. Used by
|
||||
// the smoke suite (via `banger update --manifest-url …`) to drive the
|
||||
// updater against a locally-served fake manifest. Production callers
|
||||
// stick with FetchManifest.
|
||||
func FetchManifestFrom(ctx context.Context, client *http.Client, url string) (Manifest, error) {
|
||||
if client == nil {
|
||||
client = http.DefaultClient
|
||||
}
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, manifestURL, nil)
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
|
||||
if err != nil {
|
||||
return Manifest{}, err
|
||||
}
|
||||
|
|
|
|||
|
|
@ -61,18 +61,26 @@ var ErrSignatureRequired = errors.New("banger release public key is the placehol
|
|||
|
||||
// VerifyBlobSignature checks that sigBase64 is a valid cosign-blob
|
||||
// signature over body, made with the private counterpart of
|
||||
// BangerReleasePublicKey. cosign's blob signature format is a
|
||||
// base64-encoded ASN.1-DER ECDSA signature over SHA256(body) — that's
|
||||
// what the package's ecdsa.VerifyASN1 verifies natively.
|
||||
//
|
||||
// Refuses outright if the embedded public key is still the build-
|
||||
// time placeholder, so an unset key can't slip through as
|
||||
// "verification disabled."
|
||||
// BangerReleasePublicKey.
|
||||
func VerifyBlobSignature(body, sigBase64 []byte) error {
|
||||
if isPlaceholderKey(BangerReleasePublicKey) {
|
||||
return VerifyBlobSignatureWithKey(body, sigBase64, BangerReleasePublicKey)
|
||||
}
|
||||
|
||||
// VerifyBlobSignatureWithKey is VerifyBlobSignature against an
|
||||
// explicit PEM-encoded public key. Used by the smoke suite (via
|
||||
// `banger update --pubkey-file …`) so an end-to-end update test can
|
||||
// trust a locally-generated keypair without rebuilding the binary.
|
||||
//
|
||||
// Refuses outright if pubKeyPEM is the build-time placeholder so an
|
||||
// unset key can't slip through as "verification disabled".
|
||||
//
|
||||
// cosign's blob signature format is a base64-encoded ASN.1-DER ECDSA
|
||||
// signature over SHA256(body) — that's what ecdsa.VerifyASN1 takes.
|
||||
func VerifyBlobSignatureWithKey(body, sigBase64 []byte, pubKeyPEM string) error {
|
||||
if isPlaceholderKey(pubKeyPEM) {
|
||||
return ErrSignatureRequired
|
||||
}
|
||||
block, _ := pem.Decode([]byte(BangerReleasePublicKey))
|
||||
block, _ := pem.Decode([]byte(pubKeyPEM))
|
||||
if block == nil {
|
||||
return fmt.Errorf("decode banger release public key: no PEM block")
|
||||
}
|
||||
|
|
@ -96,15 +104,21 @@ func VerifyBlobSignature(body, sigBase64 []byte) error {
|
|||
}
|
||||
|
||||
// FetchAndVerifySignature pulls the SHA256SUMS.sig URL from the
|
||||
// release, downloads it (capped), and verifies it against
|
||||
// sumsBody. Returns nil on a clean pass, or an error describing
|
||||
// exactly why verification failed.
|
||||
// release, downloads it (capped), and verifies it against sumsBody.
|
||||
// Returns nil on a clean pass, or an error describing exactly why
|
||||
// verification failed.
|
||||
//
|
||||
// If release.SHA256SumsSigURL is empty, treat that as "release was
|
||||
// not signed" — refuse rather than silently proceeding. v0.1.0
|
||||
// requires every release to be cosign-signed; an unsigned release
|
||||
// is a manifest publishing bug we'd rather catch loudly.
|
||||
func FetchAndVerifySignature(ctx context.Context, client *http.Client, release Release, sumsBody []byte) error {
|
||||
return FetchAndVerifySignatureWithKey(ctx, client, release, sumsBody, BangerReleasePublicKey)
|
||||
}
|
||||
|
||||
// FetchAndVerifySignatureWithKey is FetchAndVerifySignature against
|
||||
// an explicit PEM-encoded public key.
|
||||
func FetchAndVerifySignatureWithKey(ctx context.Context, client *http.Client, release Release, sumsBody []byte, pubKeyPEM string) error {
|
||||
if strings.TrimSpace(release.SHA256SumsSigURL) == "" {
|
||||
return fmt.Errorf("release %s has no sha256sums_sig_url; refusing to install an unsigned release", release.Version)
|
||||
}
|
||||
|
|
@ -115,7 +129,7 @@ func FetchAndVerifySignature(ctx context.Context, client *http.Client, release R
|
|||
if err != nil {
|
||||
return fmt.Errorf("fetch signature: %w", err)
|
||||
}
|
||||
if err := VerifyBlobSignature(sumsBody, sig); err != nil {
|
||||
if err := VerifyBlobSignatureWithKey(sumsBody, sig, pubKeyPEM); err != nil {
|
||||
return fmt.Errorf("verify SHA256SUMS signature: %w", err)
|
||||
}
|
||||
return nil
|
||||
|
|
|
|||
54
internal/updater/verify_smoke_check_test.go
Normal file
54
internal/updater/verify_smoke_check_test.go
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
package updater
|
||||
|
||||
import (
	"os"
	"os/exec"
	"path/filepath"
	"testing"
)
|
||||
|
||||
// TestVerifyBlobSignatureWithOpenSSL is a confidence test for the
|
||||
// smoke release-builder path: openssl's `dgst -sha256 -sign` produces
|
||||
// the exact same encoding cosign emits for blob signatures (base64
|
||||
// ASN.1 ECDSA over SHA256(body)). If this ever stops verifying, the
|
||||
// smoke update scenarios will silently skip the signature check —
|
||||
// catching it here avoids a heisenbug in scripts/smoke.sh.
|
||||
func TestVerifyBlobSignatureWithOpenSSL(t *testing.T) {
|
||||
if _, err := exec.LookPath("openssl"); err != nil {
|
||||
t.Skip("openssl not on PATH")
|
||||
}
|
||||
dir := t.TempDir()
|
||||
keyPath := filepath.Join(dir, "cosign.key")
|
||||
pubPath := filepath.Join(dir, "cosign.pub")
|
||||
bodyPath := filepath.Join(dir, "body.txt")
|
||||
sigPath := filepath.Join(dir, "body.sig")
|
||||
|
||||
mustRun := func(name string, args ...string) {
|
||||
t.Helper()
|
||||
out, err := exec.Command(name, args...).CombinedOutput()
|
||||
if err != nil {
|
||||
t.Fatalf("%s %v: %v\n%s", name, args, err, string(out))
|
||||
}
|
||||
}
|
||||
|
||||
mustRun("openssl", "ecparam", "-name", "prime256v1", "-genkey", "-noout", "-out", keyPath)
|
||||
mustRun("openssl", "ec", "-in", keyPath, "-pubout", "-out", pubPath)
|
||||
mustRun("sh", "-c", "printf 'banger smoke release sums\n' > "+bodyPath)
|
||||
mustRun("sh", "-c", "openssl dgst -sha256 -sign "+keyPath+" "+bodyPath+" | base64 -w0 > "+sigPath)
|
||||
|
||||
body := readFile(t, bodyPath)
|
||||
sig := readFile(t, sigPath)
|
||||
pub := readFile(t, pubPath)
|
||||
|
||||
if err := VerifyBlobSignatureWithKey(body, sig, string(pub)); err != nil {
|
||||
t.Fatalf("VerifyBlobSignatureWithKey: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// readFile returns the full contents of the file at p, failing the
// test immediately (via t.Fatalf) if it cannot be read. It reads
// in-process with os.ReadFile rather than spawning `cat`, so it does
// not depend on an external binary being on PATH.
func readFile(t *testing.T, p string) []byte {
	t.Helper()
	out, err := os.ReadFile(p)
	if err != nil {
		t.Fatalf("read %s: %v", p, err)
	}
	return out
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue