Move the supported systemd path to two services: an owner-user bangerd for orchestration and a narrow root helper for bridge/tap, NAT/resolver, dm/loop, and Firecracker ownership. This removes repeated sudo from daily vm and image flows without leaving the general daemon running as root. Add install metadata, system install/status/restart/uninstall commands, and a system-owned runtime layout. Keep user SSH/config material in the owner home, lock file_sync to the owner home, and move daemon known_hosts handling out of the old root-owned control path. Route privileged lifecycle steps through typed privilegedOps calls, harden the two systemd units, and rewrite smoke plus docs around the supported service model. Verified with make build, make test, make lint, and make smoke on the supported systemd host path.
233 lines
7.7 KiB
Go
233 lines
7.7 KiB
Go
// Package fcproc owns the host-side process primitives needed to launch,
|
|
// inspect, and tear down Firecracker VMs: bridge/tap setup, binary
|
|
// resolution, socket permissions, PID lookup, graceful and forceful
|
|
// shutdown. Shared by the VM lifecycle and image build paths so neither
|
|
// needs to import the other.
|
|
package fcproc
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"banger/internal/firecracker"
|
|
"banger/internal/system"
|
|
)
|
|
|
|
var (
	// ErrWaitForExitTimeout is the sentinel WaitForExit returns when its
	// timeout elapses while the process is still running. Match it with
	// errors.Is.
	ErrWaitForExitTimeout = errors.New("timed out waiting for VM to exit")
)
|
|
|
|
// Runner abstracts host command execution for fcproc; every external command
// issued by this file goes through it. system.Runner satisfies it.
type Runner interface {
	// Run executes name with args and returns the command's output. fcproc
	// uses it for probes (ip link show, pgrep) and for chmod/chown when the
	// process is already running as root.
	Run(ctx context.Context, name string, args ...string) ([]byte, error)

	// RunSudo executes args, command name first, and returns the command's
	// output. fcproc uses it for operations that change host state (ip
	// link/addr/tuntap, chmod/chown when not root, kill). Presumably the
	// implementation elevates privileges; the mechanism is up to the
	// implementation.
	RunSudo(ctx context.Context, args ...string) ([]byte, error)
}
|
|
|
|
// Config carries the host-side settings the Manager methods read: which
// firecracker binary to launch, which bridge to manage, and where runtime
// sockets live.
type Config struct {
	// FirecrackerBin names the firecracker executable. ResolveBinary treats
	// a value containing a path separator as a filesystem path and anything
	// else as a command name to look up on PATH.
	FirecrackerBin string

	// BridgeName is the bridge device EnsureBridge creates and TAPs are
	// attached to.
	BridgeName string

	// BridgeIP and CIDR are joined as "BridgeIP/CIDR" when EnsureBridge
	// assigns the bridge address, so CIDR is presumably just the prefix
	// length (e.g. "24") — confirm against the config loader.
	BridgeIP string
	CIDR     string

	// RuntimeDir is the directory EnsureSocketDir creates and chmods.
	RuntimeDir string
}
|
|
|
|
// Manager owns the shared configuration + runner and exposes the per-process
|
|
// helpers. Stateless beyond its dependencies — safe to share.
|
|
type Manager struct {
|
|
runner Runner
|
|
cfg Config
|
|
logger *slog.Logger
|
|
}
|
|
|
|
// New returns a Manager that issues commands through runner using cfg.
|
|
func New(runner Runner, cfg Config, logger *slog.Logger) *Manager {
|
|
return &Manager{runner: runner, cfg: cfg, logger: logger}
|
|
}
|
|
|
|
// EnsureBridge makes sure the host bridge exists and is up.
|
|
func (m *Manager) EnsureBridge(ctx context.Context) error {
|
|
if _, err := m.runner.Run(ctx, "ip", "link", "show", m.cfg.BridgeName); err == nil {
|
|
_, err = m.runner.RunSudo(ctx, "ip", "link", "set", m.cfg.BridgeName, "up")
|
|
return err
|
|
}
|
|
if _, err := m.runner.RunSudo(ctx, "ip", "link", "add", "name", m.cfg.BridgeName, "type", "bridge"); err != nil {
|
|
return err
|
|
}
|
|
if _, err := m.runner.RunSudo(ctx, "ip", "addr", "add", fmt.Sprintf("%s/%s", m.cfg.BridgeIP, m.cfg.CIDR), "dev", m.cfg.BridgeName); err != nil {
|
|
return err
|
|
}
|
|
_, err := m.runner.RunSudo(ctx, "ip", "link", "set", m.cfg.BridgeName, "up")
|
|
return err
|
|
}
|
|
|
|
// EnsureSocketDir creates the runtime socket directory at 0700. This is
|
|
// the directory the daemon socket, per-VM firecracker API sockets, and
|
|
// vsock sockets all live inside, so it must be readable only by the
|
|
// invoking user.
|
|
func (m *Manager) EnsureSocketDir() error {
|
|
mode := os.FileMode(0o700)
|
|
if os.Geteuid() == 0 {
|
|
mode = 0o711
|
|
}
|
|
if err := os.MkdirAll(m.cfg.RuntimeDir, mode); err != nil {
|
|
return err
|
|
}
|
|
return os.Chmod(m.cfg.RuntimeDir, mode)
|
|
}
|
|
|
|
// CreateTap (re)creates a TAP owned by the current uid/gid, attaches it to
|
|
// the bridge, and brings both up.
|
|
func (m *Manager) CreateTap(ctx context.Context, tap string) error {
|
|
return m.CreateTapOwned(ctx, tap, os.Getuid(), os.Getgid())
|
|
}
|
|
|
|
// CreateTapOwned (re)creates a TAP owned by uid:gid, attaches it to the
|
|
// bridge, and brings both up.
|
|
func (m *Manager) CreateTapOwned(ctx context.Context, tap string, uid, gid int) error {
|
|
if _, err := m.runner.Run(ctx, "ip", "link", "show", tap); err == nil {
|
|
_, _ = m.runner.RunSudo(ctx, "ip", "link", "del", tap)
|
|
}
|
|
if _, err := m.runner.RunSudo(ctx, "ip", "tuntap", "add", "dev", tap, "mode", "tap", "user", strconv.Itoa(uid), "group", strconv.Itoa(gid)); err != nil {
|
|
return err
|
|
}
|
|
if _, err := m.runner.RunSudo(ctx, "ip", "link", "set", tap, "master", m.cfg.BridgeName); err != nil {
|
|
return err
|
|
}
|
|
if _, err := m.runner.RunSudo(ctx, "ip", "link", "set", tap, "up"); err != nil {
|
|
return err
|
|
}
|
|
_, err := m.runner.RunSudo(ctx, "ip", "link", "set", m.cfg.BridgeName, "up")
|
|
return err
|
|
}
|
|
|
|
// ResolveBinary returns the path to the firecracker binary: either an
|
|
// absolute path from config, or the first hit on PATH.
|
|
func (m *Manager) ResolveBinary() (string, error) {
|
|
if m.cfg.FirecrackerBin == "" {
|
|
return "", fmt.Errorf("firecracker binary not configured; install firecracker or set firecracker_bin")
|
|
}
|
|
path := m.cfg.FirecrackerBin
|
|
if strings.ContainsRune(path, os.PathSeparator) {
|
|
if _, err := os.Stat(path); err != nil {
|
|
return "", fmt.Errorf("firecracker binary not found at %s; install firecracker or set firecracker_bin", path)
|
|
}
|
|
return path, nil
|
|
}
|
|
resolved, err := system.LookupExecutable(path)
|
|
if err != nil {
|
|
return "", fmt.Errorf("firecracker binary %q not found in PATH; install firecracker or set firecracker_bin", path)
|
|
}
|
|
return resolved, nil
|
|
}
|
|
|
|
// EnsureSocketAccess waits for the socket to appear then chowns/chmods it to
|
|
// the current uid/gid, mode 0600.
|
|
func (m *Manager) EnsureSocketAccess(ctx context.Context, socketPath, label string) error {
|
|
return m.EnsureSocketAccessFor(ctx, socketPath, label, os.Getuid(), os.Getgid())
|
|
}
|
|
|
|
// EnsureSocketAccessFor waits for the socket to appear then chowns/chmods it
|
|
// to uid:gid, mode 0600.
|
|
func (m *Manager) EnsureSocketAccessFor(ctx context.Context, socketPath, label string, uid, gid int) error {
|
|
if err := waitForPath(ctx, socketPath, 5*time.Second, label); err != nil {
|
|
return err
|
|
}
|
|
if os.Geteuid() == 0 {
|
|
if _, err := m.runner.Run(ctx, "chmod", "600", socketPath); err != nil {
|
|
return err
|
|
}
|
|
_, err := m.runner.Run(ctx, "chown", fmt.Sprintf("%d:%d", uid, gid), socketPath)
|
|
return err
|
|
}
|
|
if _, err := m.runner.RunSudo(ctx, "chmod", "600", socketPath); err != nil {
|
|
return err
|
|
}
|
|
_, err := m.runner.RunSudo(ctx, "chown", fmt.Sprintf("%d:%d", uid, gid), socketPath)
|
|
return err
|
|
}
|
|
|
|
// FindPID returns the PID of the firecracker process listening on apiSock,
|
|
// located via pgrep.
|
|
func (m *Manager) FindPID(ctx context.Context, apiSock string) (int, error) {
|
|
out, err := m.runner.Run(ctx, "pgrep", "-n", "-f", apiSock)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
return strconv.Atoi(strings.TrimSpace(string(out)))
|
|
}
|
|
|
|
// ResolvePID prefers pgrep and falls back to the firecracker machine PID.
|
|
// Returns 0 if neither source yields a PID.
|
|
func (m *Manager) ResolvePID(ctx context.Context, machine *firecracker.Machine, apiSock string) int {
|
|
if pid, err := m.FindPID(ctx, apiSock); err == nil && pid > 0 {
|
|
return pid
|
|
}
|
|
if machine != nil {
|
|
if pid, err := machine.PID(); err == nil && pid > 0 {
|
|
return pid
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// SendCtrlAltDel requests a graceful guest shutdown via the firecracker API
|
|
// socket.
|
|
func (m *Manager) SendCtrlAltDel(ctx context.Context, apiSock string) error {
|
|
if err := m.EnsureSocketAccess(ctx, apiSock, "firecracker api socket"); err != nil {
|
|
return err
|
|
}
|
|
client := firecracker.New(apiSock, m.logger)
|
|
return client.SendCtrlAltDel(ctx)
|
|
}
|
|
|
|
// WaitForExit polls until the process is gone or the timeout fires. Returns
|
|
// ErrWaitForExitTimeout on timeout, ctx.Err() on cancellation.
|
|
func (m *Manager) WaitForExit(ctx context.Context, pid int, apiSock string, timeout time.Duration) error {
|
|
deadline := time.Now().Add(timeout)
|
|
for {
|
|
if !system.ProcessRunning(pid, apiSock) {
|
|
return nil
|
|
}
|
|
if time.Now().After(deadline) {
|
|
return ErrWaitForExitTimeout
|
|
}
|
|
select {
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
case <-time.After(100 * time.Millisecond):
|
|
}
|
|
}
|
|
}
|
|
|
|
// Kill sends SIGKILL to pid.
|
|
func (m *Manager) Kill(ctx context.Context, pid int) error {
|
|
_, err := m.runner.RunSudo(ctx, "kill", "-KILL", strconv.Itoa(pid))
|
|
return err
|
|
}
|
|
|
|
// waitForPath polls path every 100ms until it exists, timeout elapses, or ctx
// is cancelled. label names the thing being waited for in error messages.
//
// The path is always probed at least once, even with a non-positive timeout.
// Returns:
//   - nil as soon as os.Stat(path) succeeds;
//   - the stat error, wrapped with label, if stat fails for any reason other
//     than the path not existing (waiting cannot fix those);
//   - an error naming label and path that wraps context.DeadlineExceeded once
//     timeout has elapsed;
//   - ctx.Err() if ctx is cancelled while waiting between probes.
func waitForPath(ctx context.Context, path string, timeout time.Duration, label string) error {
	const pollInterval = 100 * time.Millisecond

	deadline := time.Now().Add(timeout)

	// One ticker for the whole wait instead of a fresh timer per iteration.
	ticker := time.NewTicker(pollInterval)
	defer ticker.Stop()

	for {
		_, err := os.Stat(path)
		if err == nil {
			return nil
		}
		if !errors.Is(err, os.ErrNotExist) {
			return fmt.Errorf("%s not ready: %w", label, err)
		}

		// "Not before" rather than "after" so a zero timeout expires on the
		// first pass regardless of clock resolution.
		if !time.Now().Before(deadline) {
			return fmt.Errorf("%s not ready: %s: %w", label, path, context.DeadlineExceeded)
		}

		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
		}
	}
}
|