firecracker: chown API + vsock sockets inside the sudo shell

Bug: Firecracker creates its API and vsock sockets as root:root 0700
(enforced by the intentional umask 077 in buildProcessRunner). The
daemon, running as the invoking user, then can't connect(2) to
either — AF_UNIX connect needs write permission on the socket file
and a root-owned 0700 socket leaves the invoking user (thales) without any.

firecracker-go-sdk's Machine.Start() blocks on waitForSocket, which
probes the socket with both os.Stat (succeeds — parent dir is the
user's XDG_RUNTIME_DIR) and an HTTP GET over the socket (fails —
EACCES on connect). The SDK loops for 3 seconds then fails with
"Firecracker did not create API socket ... context deadline exceeded".

The daemon's EnsureSocketAccess chown was meant to fix permissions,
but it runs *after* Machine.Start returns — and Start never returns
because it's still looping on the SDK's probe. Chicken-and-egg.

Fix: inside the sudo'd shell that launches firecracker, spawn a
background subshell that polls for each expected socket (API + vsock,
when configured) and chowns it to $SUDO_UID:$SUDO_GID as soon as it
appears. The background polling is bounded at 1s (20 × 50ms) so a
broken firecracker invocation doesn't leak a waiting shell.

Post-fix: socket appears root-owned 0700 briefly, is chowned to the
invoking user within ~50ms, SDK's HTTP probe succeeds, Machine.Start
returns normally. EnsureSocketAccess's later chmod 600 remains the
belt-and-braces guarantee on final mode.

Verified: manual repro of the shell script produces a socket owned
by thales:thales that a non-root python socket.connect() accepts.
Without the fix the same setup gives "PermissionError: [Errno 13]
Permission denied".

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Thales Maciel 2026-04-22 16:09:02 -03:00
parent 60f90eb8be
commit fba30f26d4
No known key found for this signature in database
GPG key ID: 33112E6833C34679
2 changed files with 74 additions and 14 deletions

View file

@ -76,27 +76,57 @@ func TestBuildProcessRunnerUsesSudoShellWrapper(t *testing.T) {
cmd := buildProcessRunner(MachineConfig{
BinaryPath: "/repo/firecracker",
SocketPath: "/tmp/fc.sock",
VSockPath: "/tmp/vsock.sock",
VMID: "vm-1",
}, nil)
if cmd.Path != "/usr/bin/sudo" && cmd.Path != "sudo" {
t.Fatalf("command path = %q", cmd.Path)
}
if len(cmd.Args) != 5 {
if len(cmd.Args) != 6 {
t.Fatalf("args = %v", cmd.Args)
}
if cmd.Args[1] != "-n" || cmd.Args[2] != "sh" || cmd.Args[3] != "-c" {
if cmd.Args[1] != "-n" || cmd.Args[2] != "-E" || cmd.Args[3] != "sh" || cmd.Args[4] != "-c" {
t.Fatalf("args = %v", cmd.Args)
}
want := "umask 077 && exec '/repo/firecracker' --api-sock '/tmp/fc.sock' --id 'vm-1'"
if cmd.Args[4] != want {
t.Fatalf("script = %q, want %q", cmd.Args[4], want)
script := cmd.Args[5]
// The firecracker exec must run in the foreground so its exit
// status propagates through sh back to the SDK.
if !strings.Contains(script, "exec '/repo/firecracker' --api-sock '/tmp/fc.sock' --id 'vm-1'") {
t.Fatalf("script missing firecracker exec: %q", script)
}
// umask stays — the security intent is unchanged.
if !strings.Contains(script, "umask 077") {
t.Fatalf("script dropped umask 077: %q", script)
}
// Background watcher chowns both the API socket and the vsock
// socket to the invoking user as soon as they appear, so
// firecracker-go-sdk's waitForSocket HTTP probe (which needs
// connect access) isn't blocked on root-owned sockets.
if !strings.Contains(script, `chown "$SUDO_UID:$SUDO_GID" '/tmp/fc.sock'`) {
t.Fatalf("script missing API-socket chown: %q", script)
}
if !strings.Contains(script, `chown "$SUDO_UID:$SUDO_GID" '/tmp/vsock.sock'`) {
t.Fatalf("script missing vsock-socket chown: %q", script)
}
if cmd.Cancel != nil {
t.Fatal("process runner should not be tied to a request context")
}
}
// TestBuildProcessRunnerOmitsVSockChownWhenUnset checks that the shell
// script handed to sh -c does not mention vsock at all when the
// MachineConfig leaves VSockPath empty.
func TestBuildProcessRunnerOmitsVSockChownWhenUnset(t *testing.T) {
	cmd := buildProcessRunner(MachineConfig{
		BinaryPath: "/repo/firecracker",
		SocketPath: "/tmp/fc.sock",
		VMID:       "vm-1",
	}, nil)
	// Guard the index below: if the argv shape ever changes, fail with
	// a readable message instead of an index-out-of-range panic.
	if len(cmd.Args) != 6 {
		t.Fatalf("args = %v", cmd.Args)
	}
	script := cmd.Args[5]
	if strings.Contains(script, "vsock") {
		t.Fatalf("script should not mention vsock when VSockPath is empty: %q", script)
	}
}
func TestSDKLoggerBridgeEmitsStructuredDebugLogs(t *testing.T) {
var buf bytes.Buffer
logger := slog.New(slog.NewJSONHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug}))