Two defects compounded to make `vm create X` → `vm stop X` → `vm start X`
→ `vm ssh X` fail with `not_running: vm X is not running` even though
`vm show` reports `state=running`.
1. firecracker-go-sdk's startVMM spawns a goroutine that SIGTERMs
firecracker when the ctx passed to Machine.Start cancels — and
retains that ctx for the lifetime of the VMM, not just the boot
phase. Our Machine.Start wrapper was plumbing the caller's ctx
through, which on `vm.start` is the RPC request ctx. daemon.go's
handleConn cancels reqCtx via `defer cancel()` right after
writing the response. Net effect: firecracker is silently killed
~150ms after the `vm start` RPC "completes", and the next
`vm ssh` sees a dead PID. `vm.create` side-stepped the bug
because BeginVMCreate detaches to context.Background() before
calling startVMLocked; `vm.start` used the RPC ctx directly.
Fix: Machine.Start now passes context.Background() to the SDK.
We own the firecracker lifecycle explicitly (StopVM / KillVM /
cleanupRuntime), so ctx-driven cancellation here was never
actually wired into anything useful.
2. With (1) fixed, the same scenario exposed a second defect:
patchRootOverlay's e2cp/e2rm calls refuse to touch the dm-snapshot
on a restart, failing with "Inode bitmap checksum does not match
bitmap", because the COW holds stale free-block/free-inode counters
from the previous guest boot. Kernel ext4 is fine with this; e2fsprogs
is not. Fix: run `e2fsck -fy` on the snapshot between the
dm_snapshot and patch_root_overlay stages. Idempotent on a fresh
snapshot, reconciles the bitmaps on a reused COW.
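A minimal sketch of that repair step. The function name, device path,
and sudo invocation are illustrative, and the real caller goes through
the project's shell-out layer plus the system.ExitCode helper noted
under Supporting; plain os/exec appears here only to keep the sketch
self-contained.

import (
	"context"
	"fmt"
	"os/exec"
)

// Illustrative only. Forces a full e2fsck pass on the dm-snapshot device
// between the dm_snapshot and patch_root_overlay stages. Exit code 0 means
// the filesystem was already clean (fresh snapshot); exit code 1 means
// e2fsck found and corrected errors, e.g. the stale free-block/free-inode
// counters on a reused COW. Anything else is a real failure.
func fsckSnapshot(ctx context.Context, device string) error {
	out, err := exec.CommandContext(ctx, "sudo", "-n", "e2fsck", "-fy", device).CombinedOutput()
	if err == nil {
		return nil
	}
	var exitErr *exec.ExitError
	if errors.As(err, &exitErr) && exitErr.ExitCode() == 1 {
		return nil // errors corrected: expected after a prior guest boot
	}
	return fmt.Errorf("e2fsck %s: %w: %s", device, err, out)
}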
Regression coverage:
- scripts/repro-restart-bug.sh — minimal create→stop→start→ssh
reproducer with rich on-failure diagnostics (daemon log trace,
firecracker.log tail, handles.json, pgrep-by-apiSock, apiSock
stat). Exits non-zero if the bug returns.
- scripts/smoke.sh — lifecycle scenario (create/ssh/stop/start/
ssh/delete) and vm-set scenario (--vcpu 2 → stop → set --vcpu 4
→ start → assert nproc=4). Both scenarios were pulled when the
bug was first found and are now restored.
Supporting:
- internal/system/system.ExitCode — extracts exec.ExitError's
code without forcing callers to import os/exec. Needed by the
e2fsck caller (an import-policy test restricts os/exec to the
shell-out packages).
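A plausible shape for that helper, assuming a (code, ok) signature;
the actual code in internal/system may differ.

package system

import (
	"errors"
	"os/exec"
)

// ExitCode reports the exit status carried by err, and true when err wraps
// an *exec.ExitError. Callers outside the shell-out packages can branch on
// e2fsck's status through this (0 or 1 is success for `e2fsck -fy`) without
// importing os/exec themselves.
func ExitCode(err error) (int, bool) {
	var exitErr *exec.ExitError
	if errors.As(err, &exitErr) {
		return exitErr.ExitCode(), true
	}
	return 0, false
}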
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
package firecracker

import (
	"context"
	"io"
	"log/slog"
	"os"
	"os/exec"
	"strings"
	"sync"

	sdk "github.com/firecracker-microvm/firecracker-go-sdk"
	models "github.com/firecracker-microvm/firecracker-go-sdk/client/models"
	"github.com/sirupsen/logrus"

	"banger/internal/vsockagent"
)

type MachineConfig struct {
	BinaryPath      string
	VMID            string
	SocketPath      string
	LogPath         string
	MetricsPath     string
	KernelImagePath string
	InitrdPath      string
	KernelArgs      string
	Drives          []DriveConfig
	TapDevice       string
	VSockPath       string
	VSockCID        uint32
	VCPUCount       int
	MemoryMiB       int
	Logger          *slog.Logger
}

type DriveConfig struct {
	ID       string
	Path     string
	ReadOnly bool
	IsRoot   bool
}

type Machine struct {
	machine   *sdk.Machine
	logFile   *os.File
	closeOnce sync.Once
}

type Client struct {
	client *sdk.Client
}

func NewMachine(ctx context.Context, cfg MachineConfig) (*Machine, error) {
	logFile, err := openLogFile(cfg.LogPath)
	if err != nil {
		return nil, err
	}

	cmd := buildProcessRunner(cfg, logFile)
	machine, err := sdk.NewMachine(
		ctx,
		buildConfig(cfg),
		sdk.WithProcessRunner(cmd),
		sdk.WithLogger(newLogger(cfg.Logger)),
	)
	if err != nil {
		if logFile != nil {
			_ = logFile.Close()
		}
		return nil, err
	}

	return &Machine{machine: machine, logFile: logFile}, nil
}

func (m *Machine) Start(ctx context.Context) error {
	// The caller's ctx is INTENTIONALLY not forwarded to the SDK.
	// firecracker-go-sdk's startVMM (machine.go) spawns a goroutine
	// that SIGTERMs firecracker the instant this ctx cancels, and
	// retains it for the lifetime of the VMM — not just the boot
	// phase. Plumbing an RPC request ctx through would mean
	// firecracker dies the moment the daemon writes its RPC response
	// (daemon.go:handleConn defers cancel). That silently breaks
	// `vm start` on a stopped VM: start "succeeds", the handler
	// returns, ctx cancels, firecracker is SIGTERMed, and the next
	// `vm ssh` hits `vmAlive = false`. `vm.create` sidesteps the bug
	// because BeginVMCreate detaches to a background ctx before
	// calling startVMLocked.
	//
	// We own firecracker lifecycle explicitly — StopVM / KillVM /
	// cleanupRuntime — so losing ctx-driven cancellation here is
	// deliberate. The SDK still enforces its own boot-phase timeouts
	// (socket wait, HTTP) with internal deadlines.
	if err := m.machine.Start(context.Background()); err != nil {
		m.closeLog()
		return err
	}
	_ = ctx

	go func() {
		_ = m.machine.Wait(context.Background())
		m.closeLog()
	}()

	return nil
}

func (m *Machine) PID() (int, error) {
	return m.machine.PID()
}

func New(apiSock string, logger *slog.Logger) *Client {
	return &Client{client: sdk.NewClient(apiSock, newLogger(logger), false)}
}

func (c *Client) SendCtrlAltDel(ctx context.Context) error {
	action := models.InstanceActionInfoActionTypeSendCtrlAltDel
	_, err := c.client.CreateSyncAction(ctx, &models.InstanceActionInfo{
		ActionType: &action,
	})
	return err
}

func openLogFile(path string) (*os.File, error) {
	if path == "" {
		return nil, nil
	}
	return os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o644)
}

func buildConfig(cfg MachineConfig) sdk.Config {
	rootDrive, extraDrives := splitDrives(cfg.Drives)
	drivesBuilder := sdk.NewDrivesBuilder(rootDrive.Path).
		WithRootDrive(rootDrive.Path, sdk.WithDriveID(defaultDriveID(rootDrive, "rootfs")), sdk.WithReadOnly(rootDrive.ReadOnly))
	for _, drive := range extraDrives {
		if strings.TrimSpace(drive.Path) == "" {
			continue
		}
		drivesBuilder = drivesBuilder.AddDrive(drive.Path, drive.ReadOnly, sdk.WithDriveID(defaultDriveID(drive, "drive")))
	}
	drives := drivesBuilder.Build()

	return sdk.Config{
		SocketPath:      cfg.SocketPath,
		LogPath:         cfg.LogPath,
		MetricsPath:     cfg.MetricsPath,
		KernelImagePath: cfg.KernelImagePath,
		InitrdPath:      cfg.InitrdPath,
		KernelArgs:      cfg.KernelArgs,
		Drives:          drives,
		NetworkInterfaces: sdk.NetworkInterfaces{{
			StaticConfiguration: &sdk.StaticNetworkConfiguration{
				HostDevName: cfg.TapDevice,
			},
		}},
		VsockDevices: buildVsockDevices(cfg),
		MachineCfg: models.MachineConfiguration{
			VcpuCount:  sdk.Int64(int64(cfg.VCPUCount)),
			MemSizeMib: sdk.Int64(int64(cfg.MemoryMiB)),
			Smt:        sdk.Bool(false),
		},
		VMID: cfg.VMID,
	}
}

func buildVsockDevices(cfg MachineConfig) []sdk.VsockDevice {
	if strings.TrimSpace(cfg.VSockPath) == "" || cfg.VSockCID == 0 {
		return nil
	}
	return []sdk.VsockDevice{{
		ID:   "vsock",
		Path: cfg.VSockPath,
		CID:  cfg.VSockCID,
	}}
}

func splitDrives(drives []DriveConfig) (DriveConfig, []DriveConfig) {
	root := DriveConfig{ID: "rootfs"}
	var extras []DriveConfig
	for _, drive := range drives {
		if strings.TrimSpace(drive.Path) == "" {
			continue
		}
		if drive.IsRoot {
			root = drive
			if root.ID == "" {
				root.ID = "rootfs"
			}
			continue
		}
		extras = append(extras, drive)
	}
	return root, extras
}

func defaultDriveID(drive DriveConfig, fallback string) string {
	if strings.TrimSpace(drive.ID) != "" {
		return drive.ID
	}
	return fallback
}

func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd {
	// Two moving parts, run inside a single sudo'd shell:
	//
	// 1. umask 077 + exec firecracker → the API and vsock sockets
	//    firecracker creates are born 0600 owned by root (sudo user),
	//    not 0755. Without the umask there's a real window where a
	//    local attacker could hit the control plane.
	//
	// 2. A background subshell polls for each expected socket and
	//    chowns it to $SUDO_UID:$SUDO_GID as soon as it appears.
	//
	// The chown is required *before* the firecracker-go-sdk's
	// waitForSocket returns from Machine.Start — the SDK does both an
	// os.Stat and an HTTP GET over the socket, and AF_UNIX connect(2)
	// needs write permission on the socket file. With the socket at
	// 0600 root:root, the daemon process (running as the invoking
	// user) gets EACCES on connect and the SDK loops until its 3s
	// timeout. The daemon's post-Start EnsureSocketAccess chown would
	// fix it, but Start never returns to hand control back.
	//
	// Racing the chown inside sudo's shell closes the gap: by the
	// time the SDK's HTTP probe fires, the socket is already owned by
	// the invoking user.
	chownWatcher := func(path string) string {
		// Bounded poll: 20 × 50ms = 1s. Matches the SDK's 3s wait
		// budget with headroom and bails quietly if firecracker
		// never creates the socket (e.g. bad args — the error
		// surfaces through firecracker's non-zero exit).
		return `for _ in $(seq 1 20); do [ -S ` + shellQuote(path) + ` ] && break; sleep 0.05; done; ` +
			`[ -S ` + shellQuote(path) + ` ] && chown "$SUDO_UID:$SUDO_GID" ` + shellQuote(path) + ` || true`
	}
	watchers := chownWatcher(cfg.SocketPath)
	if strings.TrimSpace(cfg.VSockPath) != "" {
		watchers += "; " + chownWatcher(cfg.VSockPath)
	}
	script := "umask 077 && (" + watchers + ") & exec " + shellQuote(cfg.BinaryPath) +
		" --api-sock " + shellQuote(cfg.SocketPath) +
		" --id " + shellQuote(cfg.VMID)
	// sudo -E preserves SUDO_UID / SUDO_GID (sudo sets them itself
	// regardless, but -E is already the convention in this codebase
	// and the background subshell needs them).
	cmd := exec.Command("sudo", "-n", "-E", "sh", "-c", script)
	cmd.Stdin = nil
	if logFile != nil {
		cmd.Stdout = logFile
		cmd.Stderr = logFile
	}
	return cmd
}

func shellQuote(value string) string {
	return "'" + strings.ReplaceAll(value, "'", `'"'"'`) + "'"
}

func newLogger(base *slog.Logger) *logrus.Entry {
	logger := logrus.New()
	logger.SetOutput(io.Discard)
	logger.SetLevel(logrus.DebugLevel)
	logger.AddHook(slogHook{logger: base})
	return logrus.NewEntry(logger)
}

func HealthVSock(ctx context.Context, logger *slog.Logger, socketPath string) error {
	return vsockagent.Health(ctx, logger, socketPath)
}

func PingVSock(ctx context.Context, logger *slog.Logger, socketPath string) error {
	return HealthVSock(ctx, logger, socketPath)
}

type slogHook struct {
	logger *slog.Logger
}

func (h slogHook) Levels() []logrus.Level {
	return logrus.AllLevels
}

func (h slogHook) Fire(entry *logrus.Entry) error {
	if h.logger == nil {
		return nil
	}
	level := slog.LevelDebug
	switch entry.Level {
	case logrus.PanicLevel, logrus.FatalLevel, logrus.ErrorLevel:
		level = slog.LevelError
	case logrus.WarnLevel:
		level = slog.LevelWarn
	default:
		level = slog.LevelDebug
	}
	attrs := make([]any, 0, len(entry.Data)*2+2)
	attrs = append(attrs, "component", "firecracker_sdk")
	for key, value := range entry.Data {
		attrs = append(attrs, key, value)
	}
	h.logger.Log(context.Background(), level, entry.Message, attrs...)
	return nil
}

func (m *Machine) closeLog() {
	m.closeOnce.Do(func() {
		if m.logFile != nil {
			_ = m.logFile.Close()
		}
	})
}