From fba30f26d4852a6b8df3d94ab89d2629286d905c Mon Sep 17 00:00:00 2001 From: Thales Maciel Date: Wed, 22 Apr 2026 16:09:02 -0300 Subject: [PATCH] firecracker: chown API + vsock sockets inside the sudo shell MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug: Firecracker creates its API and vsock sockets as root:root 0700 (enforced by the intentional umask 077 in buildProcessRunner). The daemon, running as the invoking user, then can't connect(2) to either — AF_UNIX connect needs write permission on the socket file and 0700 root-owned leaves thales without any. firecracker-go-sdk's Machine.Start() blocks on waitForSocket, which probes the socket with both os.Stat (succeeds — parent dir is the user's XDG_RUNTIME_DIR) and an HTTP GET over the socket (fails — EACCES on connect). The SDK loops for 3 seconds then fails with "Firecracker did not create API socket ... context deadline exceeded". The daemon's EnsureSocketAccess chown was meant to fix permissions, but it runs *after* Machine.Start returns — and Start never returns because it's still looping on the SDK's probe. Chicken-and-egg. Fix: inside the sudo'd shell that launches firecracker, spawn a background subshell that polls for each expected socket (API + vsock, when configured) and chowns it to $SUDO_UID:$SUDO_GID as soon as it appears. The background polling is bounded at 1s (20 × 50ms) so a broken firecracker invocation doesn't leak a waiting shell. Post-fix: socket appears root-owned 0600 briefly, is chowned to the invoking user within ~50ms, SDK's HTTP probe succeeds, Machine.Start returns normally. EnsureSocketAccess's later chmod 600 remains the belt-and-braces guarantee on final mode. Verified: manual repro of the shell script produces a socket owned by thales:thales that a non-root python socket.connect() accepts. Without the fix the same setup gives "PermissionError: [Errno 13] Permission denied". Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/firecracker/client.go | 48 +++++++++++++++++++++++------ internal/firecracker/client_test.go | 40 +++++++++++++++++++++--- 2 files changed, 74 insertions(+), 14 deletions(-) diff --git a/internal/firecracker/client.go b/internal/firecracker/client.go index b2c3521..328dc40 100644 --- a/internal/firecracker/client.go +++ b/internal/firecracker/client.go @@ -184,17 +184,47 @@ func defaultDriveID(drive DriveConfig, fallback string) string { } func buildProcessRunner(cfg MachineConfig, logFile *os.File) *exec.Cmd { - // umask 077 so the API + vsock sockets firecracker creates are - // mode 0600 from birth (owned by root since we invoke via sudo). - // A follow-up chown in fcproc.EnsureSocketAccess transfers - // ownership to the invoking user. Without this, the sockets - // would briefly exist world-readable/writable between firecracker - // creating them and the daemon tightening the mode — a real - // window for a local attacker to hit the control plane. - script := "umask 077 && exec " + shellQuote(cfg.BinaryPath) + + // Two moving parts, run inside a single sudo'd shell: + // + // 1. umask 077 + exec firecracker → the API and vsock sockets + // firecracker creates are born 0600 owned by root (sudo user), + // not 0755. Without the umask there's a real window where a + // local attacker could hit the control plane. + // + // 2. A background subshell polls for each expected socket and + // chowns it to $SUDO_UID:$SUDO_GID as soon as it appears. + // + // The chown is required *before* the firecracker-go-sdk's + // waitForSocket returns from Machine.Start — the SDK does both an + // os.Stat and an HTTP GET over the socket, and AF_UNIX connect(2) + // needs write permission on the socket file. With the socket at + // 0600 root:root, the daemon process (running as the invoking + // user) gets EACCES on connect and the SDK loops until its 3s + // timeout. The daemon's post-Start EnsureSocketAccess chown would + // fix it, but Start never returns to hand control back. + // + // Racing the chown inside sudo's shell closes the gap: by the + // time the SDK's HTTP probe fires, the socket is already owned by + // the invoking user. + chownWatcher := func(path string) string { + // Bounded poll: 20 × 50ms = 1s. Matches the SDK's 3s wait + // budget with headroom and bails quietly if firecracker + // never creates the socket (e.g. bad args — the error + // surfaces through firecracker's non-zero exit). + return `for _ in $(seq 1 20); do [ -S ` + shellQuote(path) + ` ] && break; sleep 0.05; done; ` + + `[ -S ` + shellQuote(path) + ` ] && chown "$SUDO_UID:$SUDO_GID" ` + shellQuote(path) + ` || true` + } + watchers := chownWatcher(cfg.SocketPath) + if strings.TrimSpace(cfg.VSockPath) != "" { + watchers += "; " + chownWatcher(cfg.VSockPath) + } + script := "umask 077 && (" + watchers + ") & exec " + shellQuote(cfg.BinaryPath) + " --api-sock " + shellQuote(cfg.SocketPath) + " --id " + shellQuote(cfg.VMID) - cmd := exec.Command("sudo", "-n", "sh", "-c", script) + // sudo -E preserves SUDO_UID / SUDO_GID (sudo sets them itself + // regardless, but -E is already the convention in this codebase + // and the background subshell needs them). + cmd := exec.Command("sudo", "-n", "-E", "sh", "-c", script) cmd.Stdin = nil if logFile != nil { cmd.Stdout = logFile diff --git a/internal/firecracker/client_test.go b/internal/firecracker/client_test.go index dda9497..9c5b300 100644 --- a/internal/firecracker/client_test.go +++ b/internal/firecracker/client_test.go @@ -76,27 +76,57 @@ func TestBuildProcessRunnerUsesSudoShellWrapper(t *testing.T) { cmd := buildProcessRunner(MachineConfig{ BinaryPath: "/repo/firecracker", SocketPath: "/tmp/fc.sock", + VSockPath: "/tmp/vsock.sock", VMID: "vm-1", }, nil) if cmd.Path != "/usr/bin/sudo" && cmd.Path != "sudo" { t.Fatalf("command path = %q", cmd.Path) } - if len(cmd.Args) != 5 { + if len(cmd.Args) != 6 { t.Fatalf("args = %v", cmd.Args) } - if cmd.Args[1] != "-n" || cmd.Args[2] != "sh" || cmd.Args[3] != "-c" { + if cmd.Args[1] != "-n" || cmd.Args[2] != "-E" || cmd.Args[3] != "sh" || cmd.Args[4] != "-c" { t.Fatalf("args = %v", cmd.Args) } - want := "umask 077 && exec '/repo/firecracker' --api-sock '/tmp/fc.sock' --id 'vm-1'" - if cmd.Args[4] != want { - t.Fatalf("script = %q, want %q", cmd.Args[4], want) + script := cmd.Args[5] + + // The firecracker exec must run in the foreground so its exit + // status propagates through sh back to the SDK. + if !strings.Contains(script, "exec '/repo/firecracker' --api-sock '/tmp/fc.sock' --id 'vm-1'") { + t.Fatalf("script missing firecracker exec: %q", script) + } + // umask stays — the security intent is unchanged. + if !strings.Contains(script, "umask 077") { + t.Fatalf("script dropped umask 077: %q", script) + } + // Background watcher chowns both the API socket and the vsock + // socket to the invoking user as soon as they appear, so + // firecracker-go-sdk's waitForSocket HTTP probe (which needs + // connect access) isn't blocked on root-owned sockets. + if !strings.Contains(script, `chown "$SUDO_UID:$SUDO_GID" '/tmp/fc.sock'`) { + t.Fatalf("script missing API-socket chown: %q", script) + } + if !strings.Contains(script, `chown "$SUDO_UID:$SUDO_GID" '/tmp/vsock.sock'`) { + t.Fatalf("script missing vsock-socket chown: %q", script) } if cmd.Cancel != nil { t.Fatal("process runner should not be tied to a request context") } } +func TestBuildProcessRunnerOmitsVSockChownWhenUnset(t *testing.T) { + cmd := buildProcessRunner(MachineConfig{ + BinaryPath: "/repo/firecracker", + SocketPath: "/tmp/fc.sock", + VMID: "vm-1", + }, nil) + script := cmd.Args[5] + if strings.Contains(script, "vsock") { + t.Fatalf("script should not mention vsock when VSockPath is empty: %q", script) + } +} + func TestSDKLoggerBridgeEmitsStructuredDebugLogs(t *testing.T) { var buf bytes.Buffer logger := slog.New(slog.NewJSONHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug}))