banger/internal/daemon/fcproc/fcproc_test.go
Thales Maciel d73efe6fbc
firecracker: drop sudo sh -c, race chown against SDK probe in Go
Replace the shell-string launcher in buildProcessRunner with a direct
exec.Command. The previous sh -c wrapper relied on shellQuote escaping
for every MachineConfig field that flowed into the launch script; any
future field that ever carried an attacker-controlled value would have
become RCE-as-root. The new path passes binary path and flags as
separate argv entries, so there is no shell to interpret anything.

The wrapper also did two things the shell can no longer do for us:

  1. umask 077 — moved to syscall.Umask in cmd/bangerd/main.go so every
     firecracker child (and any other file the daemon creates) inherits
     0600 by default. Single-user dev sandbox state should be private.

  2. chown_watcher — the SDK's HTTP probe inside Machine.Start connects
     to the API socket the moment it appears. Under sudo the socket is
     created root-owned and the daemon's connect(2) gets EACCES, so the
     post-Start EnsureSocketAccess never runs. The shell papered over
     this with a backgrounded chown loop. Replaced by
     fcproc.EnsureSocketAccessForAsync: same race-window guarantee, in
     pure Go, kicked off in LaunchFirecracker right before Start and
     awaited right after.

Tests updated: shell-substring assertions replaced with cmd-arg
assertions, plus a new fcproc test pinning the async chown sequence.
Smoke (full systemd two-service install + KVM scenarios) passes.
2026-04-27 20:14:01 -03:00

242 lines
8.1 KiB
Go

package fcproc
import (
"context"
"errors"
"log/slog"
"os"
"path/filepath"
"testing"
"time"
)
// scriptedRunner is a small test double for the Runner dependency.
// Every call consumes the next pre-scripted step from the matching
// queue and hands back that step's canned output and error; a call
// that does not match its step, or that arrives after the queue is
// empty, fails the test. It follows the pattern used in
// internal/daemon/snapshot_test.go and is duplicated here because
// fcproc is a leaf package that cannot import its parent's test
// helpers.
type scriptedRunner struct {
	// t is the test that owns this runner; failures are reported on it.
	t *testing.T
	// runs holds the remaining scripted steps for Run, in call order.
	runs []scriptedCall
	// sudos holds the remaining scripted steps for RunSudo, in call order.
	sudos []scriptedCall
}
// scriptedCall is a single step in a scriptedRunner script: optional
// matchers for the incoming call plus the canned result to return.
type scriptedCall struct {
	// matchName is the command name expected by Run. It is left empty
	// for RunSudo steps (sudo has no distinct name arg).
	matchName string
	// matchArgs is the expected argument list; nil means "don't care".
	matchArgs []string
	// out is the canned command output returned to the caller.
	out []byte
	// err is the canned error returned to the caller.
	err error
}
// Run consumes the next scripted step from r.runs and returns its
// canned output and error.
//
// The step's matchers are enforced: a non-empty matchName must equal
// name, and a non-nil matchArgs must equal args element for element
// (nil means "don't care"). A mismatch, or a call made after the script
// is exhausted, marks the test as failed and returns a non-nil error so
// the code under test stops instead of continuing with bogus output.
//
// Failures are reported with Errorf rather than Fatalf: the code under
// test may invoke the runner from a goroutine other than the one
// running the test, and FailNow-style methods must only be called from
// the test goroutine.
func (r *scriptedRunner) Run(_ context.Context, name string, args ...string) ([]byte, error) {
	r.t.Helper()
	if len(r.runs) == 0 {
		r.t.Errorf("unexpected Run(%q, %v)", name, args)
		return nil, errors.New("scriptedRunner: unexpected Run call")
	}
	step := r.runs[0]
	r.runs = r.runs[1:]
	if step.matchName != "" && step.matchName != name {
		r.t.Errorf("Run name = %q, want %q", name, step.matchName)
		return nil, errors.New("scriptedRunner: Run name mismatch")
	}
	if step.matchArgs != nil {
		// Hand-rolled comparison keeps this helper free of extra imports.
		same := len(step.matchArgs) == len(args)
		for i := 0; same && i < len(args); i++ {
			same = step.matchArgs[i] == args[i]
		}
		if !same {
			r.t.Errorf("Run(%q) args = %v, want %v", name, args, step.matchArgs)
			return nil, errors.New("scriptedRunner: Run args mismatch")
		}
	}
	return step.out, step.err
}
// RunSudo consumes the next scripted step from r.sudos and returns its
// canned output and error.
//
// A non-nil matchArgs on the step must equal args element for element
// (nil means "don't care"); matchName is ignored because RunSudo has no
// separate name argument. A mismatch, or a call made after the script
// is exhausted, marks the test as failed and returns a non-nil error.
//
// Failures are reported with Errorf rather than Fatalf: the code under
// test may invoke the runner from a goroutine other than the one
// running the test, and FailNow-style methods must only be called from
// the test goroutine.
func (r *scriptedRunner) RunSudo(_ context.Context, args ...string) ([]byte, error) {
	r.t.Helper()
	if len(r.sudos) == 0 {
		r.t.Errorf("unexpected RunSudo(%v)", args)
		return nil, errors.New("scriptedRunner: unexpected RunSudo call")
	}
	step := r.sudos[0]
	r.sudos = r.sudos[1:]
	if step.matchArgs != nil {
		// Hand-rolled comparison keeps this helper free of extra imports.
		same := len(step.matchArgs) == len(args)
		for i := 0; same && i < len(args); i++ {
			same = step.matchArgs[i] == args[i]
		}
		if !same {
			r.t.Errorf("RunSudo args = %v, want %v", args, step.matchArgs)
			return nil, errors.New("scriptedRunner: RunSudo args mismatch")
		}
	}
	return step.out, step.err
}
// TestWaitForPathReturnsDeadlineExceededWhenSocketNeverAppears covers
// waitForPath's timeout branch: for a path that is never created, the
// returned error must wrap context.DeadlineExceeded and carry the
// caller-supplied label. Callers that wrap waitForPath (such as
// EnsureSocketAccess on the firecracker API and vsock sockets) rely on
// this for a bounded wait.
func TestWaitForPathReturnsDeadlineExceededWhenSocketNeverAppears(t *testing.T) {
	const budget = 150 * time.Millisecond
	neverCreated := filepath.Join(t.TempDir(), "never-created.sock")

	began := time.Now()
	err := waitForPath(context.Background(), neverCreated, budget, "api socket")
	waited := time.Since(began)

	if !errors.Is(err, context.DeadlineExceeded) {
		t.Fatalf("err = %v, want wrapped context.DeadlineExceeded", err)
	}
	if msg := err.Error(); !contains(msg, "api socket") {
		t.Fatalf("err = %v, want label 'api socket' in message", err)
	}

	// Only a lower bound is asserted here: returning almost instantly
	// would indicate a tight-loop regression. The poll tick (noted as
	// 100ms when this test was written) plus the initial stat make the
	// exact figure noisy, so the threshold sits below the budget.
	if waited < 90*time.Millisecond {
		t.Fatalf("returned after %s; waitForPath exited before its timeout budget", waited)
	}
}
// TestWaitForPathReturnsOnceSocketAppears covers the happy path: the
// file is created 50ms into a 2s wait, and waitForPath must return nil
// rather than an error.
func TestWaitForPathReturnsOnceSocketAppears(t *testing.T) {
	socketPath := filepath.Join(t.TempDir(), "will-appear.sock")

	// Create the file from a timer goroutine while waitForPath is polling.
	time.AfterFunc(50*time.Millisecond, func() {
		_ = os.WriteFile(socketPath, []byte{}, 0o600)
	})

	err := waitForPath(context.Background(), socketPath, 2*time.Second, "api socket")
	if err != nil {
		t.Fatalf("waitForPath: %v", err)
	}
}
// TestWaitForPathRespectsContextCancellation covers the ctx.Done()
// branch: when the context is cancelled 30ms into a 5s wait on a path
// that never appears, waitForPath must return an error wrapping
// context.Canceled.
func TestWaitForPathRespectsContextCancellation(t *testing.T) {
	ctx, cancel := context.WithCancel(context.Background())
	// Cancel from a timer goroutine while waitForPath is blocked.
	time.AfterFunc(30*time.Millisecond, cancel)

	neverCreated := filepath.Join(t.TempDir(), "never.sock")
	if err := waitForPath(ctx, neverCreated, 5*time.Second, "api socket"); !errors.Is(err, context.Canceled) {
		t.Fatalf("err = %v, want context.Canceled when ctx is cancelled mid-wait", err)
	}
}
// TestEnsureSocketAccessChmodFailureBubbles verifies the chmod step
// fails fast before any ownership handoff. Once chown runs, the
// bounded helper no longer owns the socket and can't tighten its mode
// without CAP_FOWNER, so the order matters.
//
// The runner is scripted with exactly one sudo step (the failing
// chmod). A follow-up chown would run past the end of the script, which
// scriptedRunner itself reports as a test failure, so no explicit
// "chown not called" assertion is needed here.
func TestEnsureSocketAccessChmodFailureBubbles(t *testing.T) {
	socketPath := filepath.Join(t.TempDir(), "present.sock")
	if err := os.WriteFile(socketPath, []byte{}, 0o600); err != nil {
		t.Fatalf("WriteFile: %v", err)
	}
	chmodErr := errors.New("sudo chmod failed")
	runner := &scriptedRunner{
		t:     t,
		sudos: []scriptedCall{{err: chmodErr}},
	}
	mgr := New(runner, Config{}, slog.Default())
	err := mgr.EnsureSocketAccess(context.Background(), socketPath, "api socket")
	if !errors.Is(err, chmodErr) {
		t.Fatalf("err = %v, want chmod error", err)
	}
	// A non-empty queue means the scripted chmod step was never
	// consumed, i.e. the runner was not invoked at all.
	if len(runner.sudos) != 0 {
		t.Fatalf("chmod was never attempted: %d scripted sudo calls left unconsumed", len(runner.sudos))
	}
}
// TestEnsureSocketAccessChownFailureBubbles verifies the ownership
// handoff still surfaces errors after chmod succeeds.
func TestEnsureSocketAccessChownFailureBubbles(t *testing.T) {
socketPath := filepath.Join(t.TempDir(), "present.sock")
if err := os.WriteFile(socketPath, []byte{}, 0o600); err != nil {
t.Fatalf("WriteFile: %v", err)
}
chownErr := errors.New("sudo chown failed")
runner := &scriptedRunner{
t: t,
sudos: []scriptedCall{
{}, // chmod succeeds
{err: chownErr}, // chown fails
},
}
mgr := New(runner, Config{}, slog.Default())
err := mgr.EnsureSocketAccess(context.Background(), socketPath, "api socket")
if !errors.Is(err, chownErr) {
t.Fatalf("err = %v, want chown error", err)
}
}
// TestEnsureSocketAccessTimesOutBeforeTouchingRunner pins the
// ordering contract: if waitForPath never sees the socket, the
// sudo commands must not run. Running chown/chmod against a
// non-existent path would just noise the logs.
func TestEnsureSocketAccessTimesOutBeforeTouchingRunner(t *testing.T) {
missing := filepath.Join(t.TempDir(), "never.sock")
runner := &scriptedRunner{t: t} // no scripted calls — any runner invocation fails the test
mgr := New(runner, Config{}, slog.Default())
// EnsureSocketAccess's waitForPath has a hardcoded 5s timeout,
// and we can't inject a shorter one without widening the API.
// Use a short context instead — cancellation short-circuits
// waitForPath via the ctx.Done() branch.
ctx, cancel := context.WithTimeout(context.Background(), 150*time.Millisecond)
defer cancel()
err := mgr.EnsureSocketAccess(ctx, missing, "api socket")
if err == nil {
t.Fatal("EnsureSocketAccess: want error when socket never appears")
}
}
// TestEnsureSocketAccessForAsyncReturnsImmediatelyWhenNoPaths pins the
// fast-path: callers can hand the helper an empty list (e.g. when VSockPath
// is unset) and get a no-op channel back without spinning a goroutine.
func TestEnsureSocketAccessForAsyncReturnsImmediatelyWhenNoPaths(t *testing.T) {
runner := &scriptedRunner{t: t} // any runner call would fail the test
mgr := New(runner, Config{}, slog.Default())
done := mgr.EnsureSocketAccessForAsync(context.Background(), []string{"", " "}, 1000, 1000)
select {
case err := <-done:
if err != nil {
t.Fatalf("got %v, want nil for empty input", err)
}
case <-time.After(time.Second):
t.Fatal("EnsureSocketAccessForAsync did not signal completion")
}
}
// TestEnsureSocketAccessForAsyncWaitsForSocketThenChowns covers the
// boot-time race fix: the helper is started before the socket exists,
// the socket appears 50ms later, and the helper must then consume both
// scripted sudo steps (chmod, then chown) and report success. If this
// regresses, the SDK's HTTP probe hits EACCES on a root-owned socket
// and Start times out.
func TestEnsureSocketAccessForAsyncWaitsForSocketThenChowns(t *testing.T) {
	socketPath := filepath.Join(t.TempDir(), "delayed.sock")

	// Materialise the socket from a timer goroutine after the helper
	// has started waiting for it.
	time.AfterFunc(50*time.Millisecond, func() {
		_ = os.WriteFile(socketPath, []byte{}, 0o600)
	})

	runner := &scriptedRunner{t: t}
	runner.sudos = []scriptedCall{
		{}, // chmod 600
		{}, // chown uid:gid
	}
	mgr := New(runner, Config{}, slog.Default())

	done := mgr.EnsureSocketAccessForAsync(context.Background(), []string{socketPath}, 4242, 4242)
	giveUp := time.After(2 * time.Second)

	select {
	case <-giveUp:
		t.Fatal("EnsureSocketAccessForAsync did not signal completion")
	case err := <-done:
		if err != nil {
			t.Fatalf("EnsureSocketAccessForAsync: %v", err)
		}
	}

	// Both scripted steps must have been consumed by the time done fires.
	if remaining := len(runner.sudos); remaining != 0 {
		t.Fatalf("expected both chmod and chown to run, %d sudo calls remaining", remaining)
	}
}
// contains reports whether sub occurs anywhere within s. It slides a
// window of len(sub) bytes across s and compares each window with sub;
// an empty sub therefore matches every string, including the empty one.
func contains(s, sub string) bool {
	width := len(sub)
	for lo, hi := 0, width; hi <= len(s); lo, hi = lo+1, hi+1 {
		if s[lo:hi] == sub {
			return true
		}
	}
	return false
}