banger/internal/system/system.go
Thales Maciel 6b543cb17f
firecracker: adopt firecracker-jailer for VM launch (Phase B)
Each VM's firecracker now runs inside a per-VM chroot dropped to the
registered owner UID via firecracker-jailer. Closes the broad ambient-
sudo escalation surface that survived Phase A: the helper still needs
caps for tap/bridge/dm/loop/iptables, but the VMM itself no longer
runs as root in the host root filesystem.

The host helper stages each chroot up front: hard-links the kernel
and (optional) initrd, mknods block-device drives + /dev/vhost-vsock,
copies in the firecracker binary (jailer opens it O_RDWR so a ro bind
fails with EROFS), and bind-mounts /usr/lib + /lib trees read-only so
the dynamic linker can resolve. Self-binds the chroot first so the
findmnt-guarded cleanup can recurse safely.

AF_UNIX sun_path is 108 bytes; the chroot path easily blows past that.
Daemon-side launch pre-symlinks the short request socket path to the
long chroot socket before Machine.Start so the SDK's poll/connect
sees the short path while the kernel resolves to the chroot socket.
--new-pid-ns is intentionally disabled — jailer's PID-namespace fork
makes the SDK see the parent exit and tear the API socket down too
early.

CapabilityBoundingSet for the helper expands to add CAP_FOWNER,
CAP_KILL, CAP_MKNOD, CAP_SETGID, CAP_SETUID, CAP_SYS_CHROOT alongside
the existing CAP_CHOWN/CAP_DAC_OVERRIDE/CAP_NET_ADMIN/CAP_NET_RAW/
CAP_SYS_ADMIN.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 14:38:07 -03:00

603 lines
15 KiB
Go

package system
import (
"bufio"
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"math"
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"strings"
"syscall"
"banger/internal/model"
)
type Runner struct{}
type CommandRunner interface {
Run(ctx context.Context, name string, args ...string) ([]byte, error)
RunSudo(ctx context.Context, args ...string) ([]byte, error)
}
// StdinRunner is a duck-typed extension to CommandRunner for callers
// that need to pipe stdin into a command (e.g. `debugfs -w -f -`). The
// real system.Runner implements it; test doubles don't need to unless
// they exercise this path.
type StdinRunner interface {
RunStdin(ctx context.Context, stdin io.Reader, name string, args ...string) ([]byte, error)
}
func NewRunner() Runner {
return Runner{}
}
// ExitCode extracts the process exit code from an error returned by
// Run/RunSudo. Returns -1 when the error isn't an *exec.ExitError
// (e.g. a context cancellation, the command wasn't found). Exposing
// this here keeps daemon-level callers out of os/exec — the
// shellout-policy test rejects direct imports outside system/cli/etc.
func ExitCode(err error) int {
if err == nil {
return 0
}
var exitErr *exec.ExitError
if errors.As(err, &exitErr) {
return exitErr.ExitCode()
}
return -1
}
func (Runner) Run(ctx context.Context, name string, args ...string) ([]byte, error) {
cmd := exec.CommandContext(ctx, name, args...)
var stdout bytes.Buffer
var stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
if stderr.Len() > 0 {
return stdout.Bytes(), fmt.Errorf("%w: %s", err, strings.TrimSpace(stderr.String()))
}
return stdout.Bytes(), err
}
return stdout.Bytes(), nil
}
func (r Runner) RunSudo(ctx context.Context, args ...string) ([]byte, error) {
if os.Geteuid() == 0 {
if len(args) == 0 {
return nil, errors.New("command is required")
}
return r.Run(ctx, args[0], args[1:]...)
}
all := append([]string{"-n"}, args...)
return r.Run(ctx, "sudo", all...)
}
// RunStdin executes name with args and pipes stdin in from the provided
// reader. Used for commands like debugfs -w that accept a scripted
// command stream on stdin.
func (Runner) RunStdin(ctx context.Context, stdin io.Reader, name string, args ...string) ([]byte, error) {
cmd := exec.CommandContext(ctx, name, args...)
var stdout bytes.Buffer
var stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
cmd.Stdin = stdin
if err := cmd.Run(); err != nil {
if stderr.Len() > 0 {
return stdout.Bytes(), fmt.Errorf("%w: %s", err, strings.TrimSpace(stderr.String()))
}
return stdout.Bytes(), err
}
return stdout.Bytes(), nil
}
func EnsureSudo(ctx context.Context) error {
if os.Geteuid() == 0 {
return nil
}
cmd := exec.CommandContext(ctx, "sudo", "-v")
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
cmd.Stdin = os.Stdin
return cmd.Run()
}
func CheckSudo(ctx context.Context) error {
if os.Geteuid() == 0 {
return nil
}
if _, err := exec.LookPath("sudo"); err != nil {
return err
}
cmd := exec.CommandContext(ctx, "sudo", "-n", "-v")
var stderr bytes.Buffer
cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
if stderr.Len() > 0 {
return fmt.Errorf("%w: %s", err, strings.TrimSpace(stderr.String()))
}
return err
}
return nil
}
func RequireCommands(ctx context.Context, commands ...string) error {
for _, command := range commands {
if _, err := exec.LookPath(command); err != nil {
return fmt.Errorf("required command %q not found", command)
}
}
return nil
}
func LookupExecutable(name string) (string, error) {
return exec.LookPath(name)
}
func WriteJSON(path string, value any) error {
data, err := json.MarshalIndent(value, "", " ")
if err != nil {
return err
}
return os.WriteFile(path, data, 0o644)
}
func AllocatedBytes(path string) int64 {
info, err := os.Stat(path)
if err != nil {
return 0
}
stat, ok := info.Sys().(*syscall.Stat_t)
if !ok {
return info.Size()
}
return stat.Blocks * 512
}
func ProcessRunning(pid int, apiSock string) bool {
if pid <= 0 || apiSock == "" {
return false
}
data, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "cmdline"))
if err != nil {
return false
}
cmdline := strings.ReplaceAll(string(data), "\x00", " ")
if !strings.Contains(cmdline, "firecracker") {
return false
}
if strings.Contains(cmdline, apiSock) {
return true
}
// Jailer mode: apiSock is a symlink; firecracker's cmdline has the
// chroot-internal path (e.g. "/firecracker.socket"), not the host path.
// Fall back to matching the symlink target's base name.
if target, err := os.Readlink(apiSock); err == nil {
return strings.Contains(cmdline, filepath.Base(target))
}
return false
}
type ProcessStats struct {
CPUPercent float64
RSSBytes int64
VSZBytes int64
}
type HostResources struct {
CPUCount int
TotalMemoryBytes int64
}
type FilesystemUsage struct {
TotalBytes int64
FreeBytes int64
}
func ReadProcessStats(ctx context.Context, pid int) (ProcessStats, error) {
_ = ctx
if pid <= 0 {
return ProcessStats{}, errors.New("pid is required")
}
statData, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat"))
if err != nil {
return ProcessStats{}, err
}
statmData, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "statm"))
if err != nil {
return ProcessStats{}, err
}
uptimeData, err := os.ReadFile("/proc/uptime")
if err != nil {
return ProcessStats{}, err
}
procStat, err := parseProcStat(string(statData))
if err != nil {
return ProcessStats{}, err
}
memStat, err := parseProcStatm(string(statmData))
if err != nil {
return ProcessStats{}, err
}
uptimeSeconds, err := parseProcUptime(string(uptimeData))
if err != nil {
return ProcessStats{}, err
}
const ticksPerSecond = 100.0
elapsedSeconds := uptimeSeconds - (float64(procStat.startTicks) / ticksPerSecond)
cpuPercent := 0.0
if elapsedSeconds > 0 {
totalCPUSeconds := float64(procStat.userTicks+procStat.systemTicks) / ticksPerSecond
cpuPercent = math.Max(0, (totalCPUSeconds/elapsedSeconds)*100)
}
pageSize := int64(os.Getpagesize())
return ProcessStats{
CPUPercent: cpuPercent,
RSSBytes: memStat.residentPages * pageSize,
VSZBytes: memStat.sizePages * pageSize,
}, nil
}
func ReadHostResources() (HostResources, error) {
data, err := os.ReadFile("/proc/meminfo")
if err != nil {
return HostResources{}, err
}
totalMemoryBytes, err := parseMemTotal(string(data))
if err != nil {
return HostResources{}, err
}
return HostResources{
CPUCount: runtime.NumCPU(),
TotalMemoryBytes: totalMemoryBytes,
}, nil
}
func ReadFilesystemUsage(path string) (FilesystemUsage, error) {
if strings.TrimSpace(path) == "" {
return FilesystemUsage{}, errors.New("filesystem path is required")
}
var stat syscall.Statfs_t
if err := syscall.Statfs(path, &stat); err != nil {
return FilesystemUsage{}, err
}
return FilesystemUsage{
TotalBytes: int64(stat.Blocks) * int64(stat.Bsize),
FreeBytes: int64(stat.Bavail) * int64(stat.Bsize),
}, nil
}
func TailCommand(path string, follow bool) *exec.Cmd {
if follow {
return exec.Command("tail", "-f", path)
}
return exec.Command("cat", path)
}
func ParseMetricsFile(path string) map[string]any {
data, err := os.ReadFile(path)
if err != nil || len(bytes.TrimSpace(data)) == 0 {
return nil
}
raw := bytes.TrimSpace(data)
var result map[string]any
if err := json.Unmarshal(raw, &result); err == nil {
return result
}
lastLine := lastJSONLine(raw)
if lastLine == nil {
return nil
}
if err := json.Unmarshal(lastLine, &result); err != nil {
return nil
}
return result
}
func parseMemTotal(data string) (int64, error) {
scanner := bufio.NewScanner(strings.NewReader(data))
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if !strings.HasPrefix(line, "MemTotal:") {
continue
}
fields := strings.Fields(line)
if len(fields) < 2 {
return 0, errors.New("meminfo MemTotal is malformed")
}
value, err := strconv.ParseInt(fields[1], 10, 64)
if err != nil {
return 0, fmt.Errorf("parse meminfo MemTotal: %w", err)
}
unit := "kB"
if len(fields) >= 3 {
unit = fields[2]
}
switch unit {
case "kB":
return value * 1024, nil
case "mB", "MB":
return value * 1024 * 1024, nil
case "gB", "GB":
return value * 1024 * 1024 * 1024, nil
default:
return 0, fmt.Errorf("unsupported meminfo unit %q", unit)
}
}
if err := scanner.Err(); err != nil {
return 0, err
}
return 0, errors.New("meminfo MemTotal not found")
}
func lastJSONLine(data []byte) []byte {
scanner := bufio.NewScanner(bytes.NewReader(data))
var last []byte
for scanner.Scan() {
line := bytes.TrimSpace(scanner.Bytes())
if len(line) == 0 {
continue
}
last = append([]byte(nil), line...)
}
return last
}
func CopyDirContents(ctx context.Context, runner CommandRunner, sourceDir, targetDir string, useSudo bool) error {
// Trailing "/." on the source tells cp -a to copy the directory's
// contents rather than the directory itself. filepath.Join would
// strip the dot, hence the manual concat.
args := []string{"-a", strings.TrimRight(sourceDir, "/") + "/.", targetDir + "/"}
var err error
if useSudo {
_, err = runner.RunSudo(ctx, append([]string{"cp"}, args...)...)
} else {
_, err = runner.Run(ctx, "cp", args...)
}
return err
}
func ResizeExt4Image(ctx context.Context, runner CommandRunner, path string, bytes int64) error {
if err := os.Truncate(path, bytes); err != nil {
return err
}
if _, err := runner.Run(ctx, "e2fsck", "-p", "-f", path); err != nil {
return err
}
_, err := runner.Run(ctx, "resize2fs", path)
return err
}
func ReadDebugFSText(ctx context.Context, runner CommandRunner, imagePath, guestPath string) (string, error) {
out, err := runner.Run(ctx, "debugfs", "-R", "cat "+guestPath, imagePath)
if err != nil {
return "", err
}
return string(out), nil
}
func WriteExt4File(ctx context.Context, runner CommandRunner, imagePath, guestPath string, data []byte) error {
return WriteExt4FileMode(ctx, runner, imagePath, guestPath, 0o600, data)
}
func WriteExt4FileMode(ctx context.Context, runner CommandRunner, imagePath, guestPath string, mode os.FileMode, data []byte) error {
tmp, err := os.CreateTemp("", "banger-ext4-*")
if err != nil {
return err
}
defer os.Remove(tmp.Name())
if _, err := tmp.Write(data); err != nil {
_ = tmp.Close()
return err
}
if err := tmp.Close(); err != nil {
return err
}
if err := os.Chmod(tmp.Name(), mode); err != nil {
return err
}
_, _ = runner.RunSudo(ctx, "e2rm", imagePath+":"+guestPath)
if _, err := runner.RunSudo(ctx, "e2cp", tmp.Name(), imagePath+":"+guestPath); err != nil {
return err
}
if mode.Perm()&0o111 != 0 {
modeValue := fmt.Sprintf("%#o", uint32(0o100000|mode.Perm()))
if _, err := runner.RunSudo(ctx, "debugfs", "-w", "-R", "sif "+guestPath+" mode "+modeValue, imagePath); err != nil {
return err
}
}
return nil
}
func MountTempDir(ctx context.Context, runner CommandRunner, source string, readOnly bool) (string, func() error, error) {
mountDir, err := os.MkdirTemp("", "banger-mnt-*")
if err != nil {
return "", nil, err
}
args := []string{"mount"}
var opts []string
if readOnly {
opts = append(opts, "ro")
}
if useLoopMount(source) {
opts = append(opts, "loop")
}
if len(opts) > 0 {
args = append(args, "-o", strings.Join(opts, ","))
}
args = append(args, source, mountDir)
if _, err := runner.RunSudo(ctx, args...); err != nil {
_ = os.RemoveAll(mountDir)
return "", nil, err
}
cleanup := func() error {
_, err := runner.RunSudo(context.Background(), "umount", mountDir)
_ = os.RemoveAll(mountDir)
return err
}
return mountDir, cleanup, nil
}
func useLoopMount(source string) bool {
info, err := os.Stat(source)
if err != nil {
return false
}
return info.Mode().IsRegular()
}
func UpdateFSTab(existing string) string {
lines := strings.Split(existing, "\n")
var out []string
hasRoot := false
hasRun := false
hasTmp := false
for _, line := range lines {
trimmed := strings.TrimSpace(line)
if trimmed == "" {
continue
}
fields := strings.Fields(trimmed)
if len(fields) >= 2 {
if fields[0] == "/dev/vdb" && fields[1] == "/home" {
continue
}
if fields[0] == "/dev/vdc" && fields[1] == "/var" {
continue
}
if fields[0] == "/dev/vdb" && fields[1] == "/root" {
hasRoot = true
}
if fields[0] == "tmpfs" && fields[1] == "/run" {
hasRun = true
}
if fields[0] == "tmpfs" && fields[1] == "/tmp" {
hasTmp = true
}
}
out = append(out, line)
}
if !hasRoot {
out = append(out, "/dev/vdb /root ext4 defaults 0 2")
}
if !hasRun {
out = append(out, "tmpfs /run tmpfs defaults,nodev,nosuid,mode=0755 0 0")
}
if !hasTmp {
out = append(out, "tmpfs /tmp tmpfs defaults,nodev,nosuid,mode=1777 0 0")
}
return strings.Join(out, "\n") + "\n"
}
func BuildBootArgs(vmName string) string {
return fmt.Sprintf(
"console=ttyS0 reboot=k panic=1 pci=off root=/dev/vda rootfstype=ext4 rw hostname=%s systemd.mask=home.mount systemd.mask=var.mount",
vmName,
)
}
func BuildBootArgsWithKernelIP(vmName, guestIP, bridgeIP, dns string) string {
return fmt.Sprintf(
"console=ttyS0 reboot=k panic=1 pci=off root=/dev/vda rootfstype=ext4 rw ip=%s::%s:255.255.255.0:%s:eth0:off:%s hostname=%s systemd.mask=home.mount systemd.mask=var.mount",
guestIP,
bridgeIP,
vmName,
dns,
vmName,
)
}
func ShortID(id string) string {
if len(id) <= 8 {
return id
}
return id[:8]
}
func TouchNow(vm *model.VMRecord) {
now := model.Now()
vm.UpdatedAt = now
vm.LastTouchedAt = now
}
func CopyStream(dst io.Writer, cmd *exec.Cmd) error {
cmd.Stdout = dst
cmd.Stderr = os.Stderr
cmd.Stdin = os.Stdin
return cmd.Run()
}
type procStat struct {
userTicks uint64
systemTicks uint64
startTicks uint64
}
type procStatm struct {
sizePages int64
residentPages int64
}
func parseProcStat(raw string) (procStat, error) {
raw = strings.TrimSpace(raw)
end := strings.LastIndex(raw, ")")
if end == -1 || end+2 >= len(raw) {
return procStat{}, fmt.Errorf("unexpected /proc stat format: %q", raw)
}
fields := strings.Fields(raw[end+2:])
if len(fields) < 20 {
return procStat{}, fmt.Errorf("unexpected /proc stat field count: %q", raw)
}
userTicks, err := strconv.ParseUint(fields[11], 10, 64)
if err != nil {
return procStat{}, err
}
systemTicks, err := strconv.ParseUint(fields[12], 10, 64)
if err != nil {
return procStat{}, err
}
startTicks, err := strconv.ParseUint(fields[19], 10, 64)
if err != nil {
return procStat{}, err
}
return procStat{
userTicks: userTicks,
systemTicks: systemTicks,
startTicks: startTicks,
}, nil
}
func parseProcStatm(raw string) (procStatm, error) {
fields := strings.Fields(strings.TrimSpace(raw))
if len(fields) < 2 {
return procStatm{}, fmt.Errorf("unexpected /proc statm format: %q", raw)
}
sizePages, err := strconv.ParseInt(fields[0], 10, 64)
if err != nil {
return procStatm{}, err
}
residentPages, err := strconv.ParseInt(fields[1], 10, 64)
if err != nil {
return procStatm{}, err
}
return procStatm{sizePages: sizePages, residentPages: residentPages}, nil
}
func parseProcUptime(raw string) (float64, error) {
fields := strings.Fields(strings.TrimSpace(raw))
if len(fields) == 0 {
return 0, fmt.Errorf("unexpected /proc uptime format: %q", raw)
}
return strconv.ParseFloat(fields[0], 64)
}