firecracker: drop sudo sh -c, race chown against SDK probe in Go

Replace the shell-string launcher in buildProcessRunner with a direct
exec.Command. The previous sh -c wrapper relied on shellQuote escaping
for every MachineConfig field that flowed into the launch script; any
future field that ever carried an attacker-controlled value would have
become RCE-as-root. The new path passes binary path and flags as
separate argv entries, so there is no shell to interpret anything.

The wrapper also did two things the shell can no longer do for us:

  1. umask 077 — moved to syscall.Umask in cmd/bangerd/main.go so every
     firecracker child inherits the umask and any file the daemon creates
     defaults to owner-only permissions (0600 for regular files).
     Single-user dev sandbox state should be private.

  2. chown_watcher — the SDK's HTTP probe inside Machine.Start connects
     to the API socket the moment it appears. Under sudo the socket is
     created root-owned and the daemon's connect(2) gets EACCES, so the
     post-Start EnsureSocketAccess never runs. The shell papered over
     this with a backgrounded chown loop. Replaced by
     fcproc.EnsureSocketAccessForAsync: same race-window guarantee, in
     pure Go, kicked off in LaunchFirecracker right before Start and
     awaited right after.

Tests updated: shell-substring assertions replaced with cmd-arg
assertions, plus a new fcproc test pinning the async chown sequence.
Smoke (full systemd two-service install + KVM scenarios) passes.
This commit is contained in:
Thales Maciel 2026-04-27 20:14:01 -03:00
parent c4e1cb5953
commit d73efe6fbc
No known key found for this signature in database
GPG key ID: 33112E6833C34679
6 changed files with 181 additions and 91 deletions

View file

@ -13,6 +13,7 @@ import (
"os"
"strconv"
"strings"
"sync"
"time"
"banger/internal/firecracker"
@ -137,7 +138,64 @@ func (m *Manager) EnsureSocketAccess(ctx context.Context, socketPath, label stri
// EnsureSocketAccessFor waits for the socket to appear then chowns/chmods it
// to uid:gid, mode 0600.
func (m *Manager) EnsureSocketAccessFor(ctx context.Context, socketPath, label string, uid, gid int) error {
if err := waitForPath(ctx, socketPath, 5*time.Second, label); err != nil {
return m.ensureSocketAccessFor(ctx, socketPath, label, uid, gid, 5*time.Second, 100*time.Millisecond)
}
// EnsureSocketAccessForAsync starts one background ensureSocketAccessFor call
// per non-blank entry in socketPaths and hands back a channel that yields
// exactly one result before being closed: nil when every path succeeded,
// otherwise the first error that a failing path queued (later errors are
// dropped).
//
// The result channel has capacity 1, so the background goroutine never blocks
// on the send; callers should still receive from it to learn the outcome.
// When socketPaths contains no usable entry the channel is returned already
// holding nil and closed, and no goroutine is started.
//
// Intended for firecracker boot: the SDK's HTTP probe inside Machine.Start
// connects to the API socket as soon as it appears. Under sudo the socket is
// created root-owned and the daemon's connect(2) fails with EACCES until the
// socket is chowned, so the chown has to run alongside Start rather than
// after it returns (which deadlocks). This replaces the shell-level
// chown_watcher.
//
// Each path is polled every 25ms for up to 3s (the synchronous variant polls
// every 100ms) so the chown wins against the SDK's tight HTTP retry loop.
func (m *Manager) EnsureSocketAccessForAsync(ctx context.Context, socketPaths []string, uid, gid int) <-chan error {
	result := make(chan error, 1)

	// Keep only entries that are not empty or whitespace-only; the original
	// (untrimmed) string is what gets polled.
	targets := make([]string, 0, len(socketPaths))
	for _, candidate := range socketPaths {
		if strings.TrimSpace(candidate) == "" {
			continue
		}
		targets = append(targets, candidate)
	}
	if len(targets) == 0 {
		result <- nil
		close(result)
		return result
	}

	go func() {
		defer close(result)

		// Sized so that every worker can report a failure without blocking.
		failures := make(chan error, len(targets))

		var pending sync.WaitGroup
		pending.Add(len(targets))
		for _, target := range targets {
			go func(path string) {
				defer pending.Done()
				err := m.ensureSocketAccessFor(ctx, path, "firecracker socket", uid, gid, 3*time.Second, 25*time.Millisecond)
				if err != nil {
					failures <- err
				}
			}(target)
		}
		pending.Wait()
		close(failures)

		// Receiving from the closed channel yields the first queued failure,
		// or nil when no worker reported one.
		result <- <-failures
	}()

	return result
}
func (m *Manager) ensureSocketAccessFor(ctx context.Context, socketPath, label string, uid, gid int, timeout, interval time.Duration) error {
if err := pollPath(ctx, socketPath, timeout, interval, label); err != nil {
return err
}
if os.Geteuid() == 0 {
@ -214,6 +272,10 @@ func (m *Manager) Kill(ctx context.Context, pid int) error {
}
// waitForPath delegates to pollPath with a fixed 100ms polling interval,
// passing ctx, path, timeout and label through unchanged. It returns whatever
// pollPath returns.
func waitForPath(ctx context.Context, path string, timeout time.Duration, label string) error {
	const defaultInterval = 100 * time.Millisecond
	return pollPath(ctx, path, timeout, defaultInterval, label)
}
func pollPath(ctx context.Context, path string, timeout, interval time.Duration, label string) error {
deadline := time.Now().Add(timeout)
for {
if _, err := os.Stat(path); err == nil {
@ -227,7 +289,7 @@ func waitForPath(ctx context.Context, path string, timeout time.Duration, label
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(100 * time.Millisecond):
case <-time.After(interval):
}
}
}