banger/internal/daemon/capabilities.go
Thales Maciel 3edd7c6de7
daemon: build a work-seed during image pull, refresh doctor check
Before this change `banger image pull` (both OCI-direct and bundle
paths) shipped images with an empty WorkSeedPath — the BuildWorkSeedImage
helper existed only behind the hidden `banger internal work-seed` CLI.
Every pulled image hit ensureWorkDisk's no-seed branch, and the guest
booted with a bare /root (no .bashrc, no .profile, none of the distro
defaults).

Pull now calls BuildWorkSeedImage after the rootfs is finalised (OCI)
or fetched (bundle). The builder is behind a new `workSeedBuilder` test
seam so existing pull tests don't accidentally demand sudo mount. The
build failure is non-fatal: any error logs a warning and leaves
WorkSeedPath empty — images stay publishable even if the pulled rootfs
has no /root to extract.

Verified end-to-end by wiping the cached smoke image and re-pulling:
work-seed.ext4 lands in the artifact dir next to rootfs.ext4, and all
21 smoke scenarios pass.

Also refreshes the "feature /root work disk" fallback tooling check —
the no-seed path no longer touches mount/umount/cp after commit
0e28504, so the doctor check now only requires truncate + mkfs.ext4.
The warn copy updates from "new VM creates will be slower" to "guest
/root will be empty", which matches the actual tradeoff post-refactor.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 20:24:10 -03:00

373 lines
12 KiB
Go

package daemon
import (
"context"
"errors"
"log/slog"
"net"
"os"
"strings"
"banger/internal/firecracker"
"banger/internal/guestconfig"
"banger/internal/model"
"banger/internal/store"
"banger/internal/system"
"banger/internal/vmdns"
)
// vmCapability is the base capability tag. Actual behaviour lives on
// optional sub-interfaces (startPreflight / guestConfig / machineConfig
// / prepareHost / postStart / cleanup / configChange / doctor); a
// capability implements whichever subset it cares about. None of them
// take *Daemon — each capability is a struct constructed with its
// explicit service-pointer dependencies at wireServices time.
type vmCapability interface {
Name() string
}
type startPreflightCapability interface {
AddStartPreflight(context.Context, *system.Preflight, model.VMRecord, model.Image)
}
type guestConfigCapability interface {
ContributeGuest(*guestconfig.Builder, model.VMRecord, model.Image)
}
type machineConfigCapability interface {
ContributeMachine(*firecracker.MachineConfig, model.VMRecord, model.Image)
}
type prepareHostCapability interface {
PrepareHost(context.Context, *model.VMRecord, model.Image) error
}
type postStartCapability interface {
PostStart(context.Context, model.VMRecord, model.Image) error
}
type cleanupCapability interface {
Cleanup(context.Context, model.VMRecord) error
}
type configChangeCapability interface {
ApplyConfigChange(context.Context, model.VMRecord, model.VMRecord) error
}
type doctorCapability interface {
AddDoctorChecks(context.Context, *system.Report)
}
// defaultCapabilities builds the production capability list from
// already-constructed services. Called from wireServices once d.vm /
// d.ws / d.net are populated, so every capability ships with the
// concrete service pointers it needs and none of them reach through
// *Daemon at dispatch time.
func (d *Daemon) defaultCapabilities() []vmCapability {
return []vmCapability{
newWorkDiskCapability(d.vm, d.ws, d.store, d.config.DefaultImageName),
newDNSCapability(d.net),
newNATCapability(d.vm, d.net, d.logger),
}
}
func (d *Daemon) addCapabilityStartPrereqs(ctx context.Context, checks *system.Preflight, vm model.VMRecord, image model.Image) {
for _, capability := range d.vmCaps {
if hook, ok := capability.(startPreflightCapability); ok {
hook.AddStartPreflight(ctx, checks, vm, image)
}
}
}
func (d *Daemon) contributeGuestConfig(builder *guestconfig.Builder, vm model.VMRecord, image model.Image) {
for _, capability := range d.vmCaps {
if hook, ok := capability.(guestConfigCapability); ok {
hook.ContributeGuest(builder, vm, image)
}
}
}
func (d *Daemon) contributeMachineConfig(cfg *firecracker.MachineConfig, vm model.VMRecord, image model.Image) {
for _, capability := range d.vmCaps {
if hook, ok := capability.(machineConfigCapability); ok {
hook.ContributeMachine(cfg, vm, image)
}
}
}
func (d *Daemon) prepareCapabilityHosts(ctx context.Context, vm *model.VMRecord, image model.Image) error {
prepared := make([]vmCapability, 0, len(d.vmCaps))
for _, capability := range d.vmCaps {
hook, ok := capability.(prepareHostCapability)
if !ok {
continue
}
if err := hook.PrepareHost(ctx, vm, image); err != nil {
d.cleanupPreparedCapabilities(context.Background(), vm, prepared)
return err
}
prepared = append(prepared, capability)
}
return nil
}
func (d *Daemon) postStartCapabilities(ctx context.Context, vm model.VMRecord, image model.Image) error {
for _, capability := range d.vmCaps {
switch capability.Name() {
case "dns":
vmCreateStage(ctx, "apply_dns", "publishing vm dns record")
case "nat":
if vm.Spec.NATEnabled {
vmCreateStage(ctx, "apply_nat", "configuring nat")
}
}
if hook, ok := capability.(postStartCapability); ok {
if err := hook.PostStart(ctx, vm, image); err != nil {
return err
}
}
}
return nil
}
func (d *Daemon) cleanupCapabilityState(ctx context.Context, vm model.VMRecord) error {
return d.cleanupPreparedCapabilities(ctx, &vm, d.vmCaps)
}
func (d *Daemon) cleanupPreparedCapabilities(ctx context.Context, vm *model.VMRecord, capabilities []vmCapability) error {
var err error
for index := len(capabilities) - 1; index >= 0; index-- {
hook, ok := capabilities[index].(cleanupCapability)
if !ok {
continue
}
err = joinErr(err, hook.Cleanup(ctx, *vm))
}
return err
}
func (d *Daemon) applyCapabilityConfigChanges(ctx context.Context, before, after model.VMRecord) error {
for _, capability := range d.vmCaps {
if hook, ok := capability.(configChangeCapability); ok {
if err := hook.ApplyConfigChange(ctx, before, after); err != nil {
return err
}
}
}
return nil
}
func (d *Daemon) addCapabilityDoctorChecks(ctx context.Context, report *system.Report) {
for _, capability := range d.vmCaps {
if hook, ok := capability.(doctorCapability); ok {
hook.AddDoctorChecks(ctx, report)
}
}
}
// workDiskCapability provisions a per-VM work disk (image-seeded or
// freshly formatted) and syncs host-side authorised keys + git
// identity + file_sync entries onto it. Holds pointers to the VM and
// workspace services because PrepareHost orchestrates across both,
// plus the store + default image name for its doctor check.
type workDiskCapability struct {
vm *VMService
ws *WorkspaceService
store *store.Store
defaultImageName string
}
func newWorkDiskCapability(vm *VMService, ws *WorkspaceService, st *store.Store, defaultImageName string) workDiskCapability {
return workDiskCapability{
vm: vm,
ws: ws,
store: st,
defaultImageName: defaultImageName,
}
}
func (workDiskCapability) Name() string { return "work-disk" }
func (workDiskCapability) AddStartPreflight(_ context.Context, checks *system.Preflight, vm model.VMRecord, image model.Image) {
if exists(vm.Runtime.WorkDiskPath) {
return
}
imageSeed := ""
if image.RootfsPath != "" {
imageSeed = image.WorkSeedPath
}
if exists(imageSeed) {
if info, err := os.Stat(imageSeed); err == nil && vm.Spec.WorkDiskSizeBytes > info.Size() {
checks.RequireCommand("e2fsck", toolHint("e2fsck"))
checks.RequireCommand("resize2fs", toolHint("resize2fs"))
}
return
}
for _, command := range []string{"mkfs.ext4", "mount", "umount", "cp"} {
checks.RequireCommand(command, toolHint(command))
}
}
func (workDiskCapability) ContributeGuest(builder *guestconfig.Builder, _ model.VMRecord, _ model.Image) {
builder.AddMount(guestconfig.MountSpec{
Source: "/dev/vdb",
Target: "/root",
FSType: "ext4",
Options: []string{"defaults"},
Dump: 0,
Pass: 2,
})
}
func (workDiskCapability) ContributeMachine(cfg *firecracker.MachineConfig, vm model.VMRecord, _ model.Image) {
cfg.Drives = append(cfg.Drives, firecracker.DriveConfig{
ID: "work",
Path: vm.Runtime.WorkDiskPath,
ReadOnly: false,
})
}
func (c workDiskCapability) PrepareHost(ctx context.Context, vm *model.VMRecord, image model.Image) error {
prep, err := c.vm.ensureWorkDisk(ctx, vm, image)
if err != nil {
return err
}
if err := c.ws.ensureAuthorizedKeyOnWorkDisk(ctx, vm, image, prep); err != nil {
return err
}
if err := c.ws.ensureGitIdentityOnWorkDisk(ctx, vm); err != nil {
return err
}
return c.ws.runFileSync(ctx, vm)
}
func (c workDiskCapability) AddDoctorChecks(_ context.Context, report *system.Report) {
if c.store != nil && strings.TrimSpace(c.defaultImageName) != "" {
if image, err := c.store.GetImageByName(context.Background(), c.defaultImageName); err == nil && strings.TrimSpace(image.WorkSeedPath) != "" && exists(image.WorkSeedPath) {
checks := system.NewPreflight()
checks.RequireFile(image.WorkSeedPath, "default image work-seed", `rebuild the default image to regenerate the /root seed`)
report.AddPreflight("feature /root work disk", checks, "seeded /root work disk artifact available")
return
}
}
checks := system.NewPreflight()
for _, command := range []string{"truncate", "mkfs.ext4"} {
checks.RequireCommand(command, toolHint(command))
}
report.AddPreflight("feature /root work disk", checks, "fallback /root work disk tooling available")
report.AddWarn("feature /root work disk", "default image has no work-seed artifact; guest /root will be empty until the image is rebuilt")
}
// dnsCapability publishes + removes <vm>.vm records on the in-process
// DNS server. Only needs HostNetwork.
type dnsCapability struct {
net *HostNetwork
}
func newDNSCapability(net *HostNetwork) dnsCapability {
return dnsCapability{net: net}
}
func (dnsCapability) Name() string { return "dns" }
func (c dnsCapability) PostStart(ctx context.Context, vm model.VMRecord, _ model.Image) error {
return c.net.setDNS(ctx, vm.Name, vm.Runtime.GuestIP)
}
func (c dnsCapability) Cleanup(_ context.Context, vm model.VMRecord) error {
return c.net.removeDNS(vm.Runtime.DNSName)
}
func (dnsCapability) AddDoctorChecks(_ context.Context, report *system.Report) {
conn, err := net.ListenPacket("udp", vmdns.DefaultListenAddr)
if err != nil {
if strings.Contains(strings.ToLower(err.Error()), "address already in use") {
report.AddWarn("feature vm dns", "listener address "+vmdns.DefaultListenAddr+" is already in use")
return
}
report.AddFail("feature vm dns", "cannot bind "+vmdns.DefaultListenAddr+": "+err.Error())
return
}
_ = conn.Close()
report.AddPass("feature vm dns", "listener can bind "+vmdns.DefaultListenAddr)
}
// natCapability sets up host-side NAT so guest traffic can reach the
// outside world. Needs VMService (tap lookup + aliveness) and
// HostNetwork (NAT rules), plus the daemon logger for the cleanup
// short-circuit note.
type natCapability struct {
vm *VMService
net *HostNetwork
logger *slog.Logger
}
func newNATCapability(vm *VMService, net *HostNetwork, logger *slog.Logger) natCapability {
return natCapability{vm: vm, net: net, logger: logger}
}
func (natCapability) Name() string { return "nat" }
func (c natCapability) AddStartPreflight(ctx context.Context, checks *system.Preflight, vm model.VMRecord, _ model.Image) {
if !vm.Spec.NATEnabled {
return
}
c.net.addNATPrereqs(ctx, checks)
}
func (c natCapability) PostStart(ctx context.Context, vm model.VMRecord, _ model.Image) error {
if !vm.Spec.NATEnabled {
return nil
}
return c.net.ensureNAT(ctx, vm.Runtime.GuestIP, c.vm.vmHandles(vm.ID).TapDevice, true)
}
func (c natCapability) Cleanup(ctx context.Context, vm model.VMRecord) error {
if !vm.Spec.NATEnabled {
return nil
}
// Handle cache is volatile across daemon restarts; Runtime is
// the persisted DB-backed copy. Fall back so a crash / corrupt
// handles.json doesn't leak iptables rules keyed off the tap.
tap := strings.TrimSpace(c.vm.vmHandles(vm.ID).TapDevice)
if tap == "" {
tap = strings.TrimSpace(vm.Runtime.TapDevice)
}
if strings.TrimSpace(vm.Runtime.GuestIP) == "" || tap == "" {
if c.logger != nil {
c.logger.Debug("skipping nat cleanup without runtime network handles", append(vmLogAttrs(vm), "guest_ip", vm.Runtime.GuestIP, "tap_device", tap)...)
}
return nil
}
return c.net.ensureNAT(ctx, vm.Runtime.GuestIP, tap, false)
}
func (c natCapability) ApplyConfigChange(ctx context.Context, before, after model.VMRecord) error {
if before.Spec.NATEnabled == after.Spec.NATEnabled {
return nil
}
if !c.vm.vmAlive(after) {
return nil
}
return c.net.ensureNAT(ctx, after.Runtime.GuestIP, c.vm.vmHandles(after.ID).TapDevice, after.Spec.NATEnabled)
}
func (c natCapability) AddDoctorChecks(ctx context.Context, report *system.Report) {
checks := system.NewPreflight()
checks.RequireCommand("ip", toolHint("ip"))
c.net.addNATPrereqs(ctx, checks)
if len(checks.Problems()) > 0 {
report.Add(system.CheckStatusFail, "feature nat", checks.Problems()...)
return
}
uplink, err := c.net.defaultUplink(ctx)
if err != nil {
report.AddFail("feature nat", err.Error())
return
}
report.AddPass("feature nat", "iptables/sysctl available, uplink "+uplink)
}
func joinErr(current, next error) error {
return errors.Join(current, next)
}