banger/internal/daemon/vm.go
Thales Maciel 787b234029
Fix VM startup regressions after shell-out cleanup
The shell-out reduction pass introduced two linked startup regressions in the hot path for vm create.

Make flattenNestedWorkHome repair the temporary nested /root tree without trying to read a root-owned 0700 directory as the calling user: chmod the scratch directory under sudo, then copy each child entry individually before removing it. Add a regression test for that overlap/permission case.

Restore the Firecracker launch wrapper that sets umask 000 before exec. Firecracker was creating the API socket, but the SDK could not use it during machine.Start after the direct sudo launch, so vm create timed out waiting on a socket that already existed.

Validated with go test ./... and make build.
2026-03-18 12:18:34 -03:00

947 lines
28 KiB
Go

package daemon
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"banger/internal/api"
"banger/internal/firecracker"
"banger/internal/model"
"banger/internal/paths"
"banger/internal/system"
"banger/internal/vmdns"
)
// CreateVM provisions a new VM: validates optional vcpu/memory settings,
// resolves the backing image (falling back to the configured default),
// allocates a unique name, ID and guest IP, persists the record, and —
// unless params.NoStart is set — boots it immediately via startVMLocked.
// Returns the persisted (and possibly started) record.
func (d *Daemon) CreateVM(ctx context.Context, params api.VMCreateParams) (vm model.VMRecord, err error) {
	d.mu.Lock()
	defer d.mu.Unlock()
	op := d.beginOperation("vm.create")
	defer func() {
		// Named results let this deferred closure report the final outcome.
		if err != nil {
			op.fail(err)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	if err := validateOptionalPositiveSetting("vcpu", params.VCPUCount); err != nil {
		return model.VMRecord{}, err
	}
	if err := validateOptionalPositiveSetting("memory", params.MemoryMiB); err != nil {
		return model.VMRecord{}, err
	}
	imageName := params.ImageName
	if imageName == "" {
		imageName = d.config.DefaultImageName
	}
	image, err := d.FindImage(ctx, imageName)
	if err != nil {
		return model.VMRecord{}, err
	}
	op.stage("image_resolved", imageLogAttrs(image)...)
	name := strings.TrimSpace(params.Name)
	if name == "" {
		name, err = d.generateName(ctx)
		if err != nil {
			return model.VMRecord{}, err
		}
	}
	// A successful lookup means the name is taken.
	if _, err := d.FindVM(ctx, name); err == nil {
		return model.VMRecord{}, fmt.Errorf("vm name already exists: %s", name)
	}
	id, err := model.NewID()
	if err != nil {
		return model.VMRecord{}, err
	}
	guestIP, err := d.store.NextGuestIP(ctx, bridgePrefix(d.config.BridgeIP))
	if err != nil {
		return model.VMRecord{}, err
	}
	vmDir := filepath.Join(d.layout.VMsDir, id)
	if err := os.MkdirAll(vmDir, 0o755); err != nil {
		return model.VMRecord{}, err
	}
	// Disk sizes: explicit params override the model defaults.
	systemOverlaySize := int64(model.DefaultSystemOverlaySize)
	if params.SystemOverlaySize != "" {
		systemOverlaySize, err = model.ParseSize(params.SystemOverlaySize)
		if err != nil {
			return model.VMRecord{}, err
		}
	}
	workDiskSize := int64(model.DefaultWorkDiskSize)
	if params.WorkDiskSize != "" {
		workDiskSize, err = model.ParseSize(params.WorkDiskSize)
		if err != nil {
			return model.VMRecord{}, err
		}
	}
	now := model.Now()
	spec := model.VMSpec{
		VCPUCount:             optionalIntOrDefault(params.VCPUCount, model.DefaultVCPUCount),
		MemoryMiB:             optionalIntOrDefault(params.MemoryMiB, model.DefaultMemoryMiB),
		SystemOverlaySizeByte: systemOverlaySize,
		WorkDiskSizeBytes:     workDiskSize,
		NATEnabled:            params.NATEnabled,
	}
	vm = model.VMRecord{
		ID:            id,
		Name:          name,
		ImageID:       image.ID,
		State:         model.VMStateCreated,
		CreatedAt:     now,
		UpdatedAt:     now,
		LastTouchedAt: now,
		Spec:          spec,
		Runtime: model.VMRuntime{
			State:         model.VMStateCreated,
			GuestIP:       guestIP,
			DNSName:       vmdns.RecordName(name),
			VMDir:         vmDir,
			SystemOverlay: filepath.Join(vmDir, "system.cow"),
			WorkDiskPath:  filepath.Join(vmDir, "root.ext4"),
			LogPath:       filepath.Join(vmDir, "firecracker.log"),
			MetricsPath:   filepath.Join(vmDir, "metrics.json"),
		},
	}
	if err := d.store.UpsertVM(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("persisted", vmLogAttrs(vm)...)
	if params.NoStart {
		// Record is created but left stopped; caller starts it later.
		vm.State = model.VMStateStopped
		vm.Runtime.State = model.VMStateStopped
		if err := d.store.UpsertVM(ctx, vm); err != nil {
			return model.VMRecord{}, err
		}
		return vm, nil
	}
	return d.startVMLocked(ctx, vm, image)
}
// StartVM boots the VM identified by id or name. When the VM already has a
// live firecracker process behind its API socket it is returned as-is.
func (d *Daemon) StartVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	d.mu.Lock()
	defer d.mu.Unlock()
	vm, err := d.FindVM(ctx, idOrName)
	if err != nil {
		return model.VMRecord{}, err
	}
	image, err := d.store.GetImageByID(ctx, vm.ImageID)
	if err != nil {
		return model.VMRecord{}, err
	}
	alreadyUp := vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath)
	if !alreadyUp {
		return d.startVMLocked(ctx, vm, image)
	}
	if d.logger != nil {
		d.logger.Info("vm already running", vmLogAttrs(vm)...)
	}
	return vm, nil
}
// startVMLocked performs the full boot sequence for a VM. Caller must hold
// d.mu. Order matters: bridge and socket dir first, then overlay + dm
// snapshot, then guest filesystem patching, then the firecracker launch and
// post-launch networking/DNS. On any failure after the snapshot is created,
// cleanupOnErr tears runtime state back down and persists the error state.
func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image model.Image) (_ model.VMRecord, err error) {
	op := d.beginOperation("vm.start", append(vmLogAttrs(vm), imageLogAttrs(image)...)...)
	defer func() {
		if err != nil {
			// Point the operator at the firecracker log on failure.
			err = annotateLogPath(err, vm.Runtime.LogPath)
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	op.stage("preflight")
	if err := d.validateStartPrereqs(ctx, vm, image); err != nil {
		return model.VMRecord{}, err
	}
	if err := os.MkdirAll(vm.Runtime.VMDir, 0o755); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("cleanup_runtime")
	// Clear any leftovers from a previous boot (preserving disks).
	if err := d.cleanupRuntime(ctx, vm, true); err != nil {
		return model.VMRecord{}, err
	}
	clearRuntimeHandles(&vm)
	op.stage("bridge")
	if err := d.ensureBridge(ctx); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("socket_dir")
	if err := d.ensureSocketDir(); err != nil {
		return model.VMRecord{}, err
	}
	// Per-boot resource names are derived from a shortened VM ID.
	shortID := system.ShortID(vm.ID)
	apiSock := filepath.Join(d.layout.RuntimeDir, "fc-"+shortID+".sock")
	tap := "tap-fc-" + shortID
	dmName := "fc-rootfs-" + shortID
	if err := os.RemoveAll(apiSock); err != nil && !os.IsNotExist(err) {
		return model.VMRecord{}, err
	}
	op.stage("system_overlay", "overlay_path", vm.Runtime.SystemOverlay)
	if err := d.ensureSystemOverlay(ctx, &vm); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("dm_snapshot", "dm_name", dmName)
	// Copy-on-write root: image rootfs + per-VM overlay via device-mapper.
	handles, err := d.createDMSnapshot(ctx, image.RootfsPath, vm.Runtime.SystemOverlay, dmName)
	if err != nil {
		return model.VMRecord{}, err
	}
	vm.Runtime.BaseLoop = handles.BaseLoop
	vm.Runtime.COWLoop = handles.COWLoop
	vm.Runtime.DMName = handles.DMName
	vm.Runtime.DMDev = handles.DMDev
	vm.Runtime.APISockPath = apiSock
	vm.Runtime.TapDevice = tap
	vm.Runtime.State = model.VMStateRunning
	vm.State = model.VMStateRunning
	vm.Runtime.LastError = ""
	// From here on, failures must release the snapshot/tap/socket we created.
	// Uses context.Background() so cleanup proceeds even if ctx was canceled.
	cleanupOnErr := func(err error) (model.VMRecord, error) {
		vm.State = model.VMStateError
		vm.Runtime.State = model.VMStateError
		vm.Runtime.LastError = err.Error()
		op.stage("cleanup_after_failure", "error", err.Error())
		if cleanupErr := d.cleanupRuntime(context.Background(), vm, true); cleanupErr != nil {
			err = errors.Join(err, cleanupErr)
		}
		clearRuntimeHandles(&vm)
		_ = d.store.UpsertVM(context.Background(), vm)
		return model.VMRecord{}, err
	}
	op.stage("patch_root_overlay")
	if err := d.patchRootOverlay(ctx, vm, image); err != nil {
		return cleanupOnErr(err)
	}
	op.stage("work_disk", "work_disk_path", vm.Runtime.WorkDiskPath)
	if err := d.ensureWorkDisk(ctx, &vm); err != nil {
		return cleanupOnErr(err)
	}
	op.stage("tap", "tap_device", tap)
	if err := d.createTap(ctx, tap); err != nil {
		return cleanupOnErr(err)
	}
	op.stage("metrics_file", "metrics_path", vm.Runtime.MetricsPath)
	// Pre-create (truncate) the metrics file so firecracker can write to it.
	if err := os.WriteFile(vm.Runtime.MetricsPath, nil, 0o644); err != nil {
		return cleanupOnErr(err)
	}
	op.stage("firecracker_binary")
	fcPath, err := d.firecrackerBinary()
	if err != nil {
		return cleanupOnErr(err)
	}
	op.stage("firecracker_launch", "log_path", vm.Runtime.LogPath, "metrics_path", vm.Runtime.MetricsPath)
	// The machine's lifetime outlives this request, so it gets its own ctx.
	firecrackerCtx := context.Background()
	machine, err := firecracker.NewMachine(firecrackerCtx, firecracker.MachineConfig{
		BinaryPath:      fcPath,
		VMID:            vm.ID,
		SocketPath:      apiSock,
		LogPath:         vm.Runtime.LogPath,
		MetricsPath:     vm.Runtime.MetricsPath,
		KernelImagePath: image.KernelPath,
		InitrdPath:      image.InitrdPath,
		KernelArgs:      system.BuildBootArgs(vm.Name, vm.Runtime.GuestIP, d.config.BridgeIP, d.config.DefaultDNS),
		RootDrivePath:   vm.Runtime.DMDev,
		WorkDrivePath:   vm.Runtime.WorkDiskPath,
		TapDevice:       tap,
		VCPUCount:       vm.Spec.VCPUCount,
		MemoryMiB:       vm.Spec.MemoryMiB,
		Logger:          d.logger,
	})
	if err != nil {
		return cleanupOnErr(err)
	}
	if err := machine.Start(firecrackerCtx); err != nil {
		// Capture the PID even on failed start so cleanup can kill a
		// half-launched process.
		vm.Runtime.PID = d.resolveFirecrackerPID(firecrackerCtx, machine, apiSock)
		return cleanupOnErr(err)
	}
	vm.Runtime.PID = d.resolveFirecrackerPID(firecrackerCtx, machine, apiSock)
	op.debugStage("firecracker_started", "pid", vm.Runtime.PID)
	op.stage("socket_access", "api_socket", apiSock)
	if err := d.ensureSocketAccess(ctx, apiSock); err != nil {
		return cleanupOnErr(err)
	}
	op.stage("dns", "dns_name", vm.Runtime.DNSName)
	if err := d.setDNS(ctx, vm.Name, vm.Runtime.GuestIP); err != nil {
		return cleanupOnErr(err)
	}
	if vm.Spec.NATEnabled {
		op.stage("nat")
		if err := d.ensureNAT(ctx, vm, true); err != nil {
			return cleanupOnErr(err)
		}
	}
	system.TouchNow(&vm)
	op.stage("persist")
	if err := d.store.UpsertVM(ctx, vm); err != nil {
		return cleanupOnErr(err)
	}
	return vm, nil
}
// StopVM gracefully shuts a running VM down: Ctrl+Alt+Del via the API
// socket, wait up to 30s for the process to exit, then tear down runtime
// resources and persist the stopped state. A record marked running whose
// process is actually gone is treated as stale and cleaned up directly.
func (d *Daemon) StopVM(ctx context.Context, idOrName string) (vm model.VMRecord, err error) {
	d.mu.Lock()
	defer d.mu.Unlock()
	op := d.beginOperation("vm.stop", "vm_ref", idOrName)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	vm, err = d.FindVM(ctx, idOrName)
	if err != nil {
		return model.VMRecord{}, err
	}
	// Stale path: nothing to signal, just reconcile record and runtime.
	if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
		op.stage("cleanup_stale_runtime")
		if err := d.cleanupRuntime(ctx, vm, true); err != nil {
			return model.VMRecord{}, err
		}
		vm.State = model.VMStateStopped
		vm.Runtime.State = model.VMStateStopped
		clearRuntimeHandles(&vm)
		if err := d.store.UpsertVM(ctx, vm); err != nil {
			return model.VMRecord{}, err
		}
		return vm, nil
	}
	op.stage("graceful_shutdown")
	if err := d.sendCtrlAltDel(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("wait_for_exit", "pid", vm.Runtime.PID)
	if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("cleanup_runtime")
	// preserveDisks=true keeps the overlay/work disk so the VM can restart.
	if err := d.cleanupRuntime(ctx, vm, true); err != nil {
		return model.VMRecord{}, err
	}
	vm.State = model.VMStateStopped
	vm.Runtime.State = model.VMStateStopped
	clearRuntimeHandles(&vm)
	system.TouchNow(&vm)
	if err := d.store.UpsertVM(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	return vm, nil
}
// KillVM forcibly signals a running VM's firecracker process (default TERM),
// waits up to 30s for exit, then tears down runtime resources and persists
// the stopped state. Stale records (process already gone) are reconciled
// without sending a signal.
func (d *Daemon) KillVM(ctx context.Context, params api.VMKillParams) (vm model.VMRecord, err error) {
	d.mu.Lock()
	defer d.mu.Unlock()
	op := d.beginOperation("vm.kill", "vm_ref", params.IDOrName, "signal", params.Signal)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	vm, err = d.FindVM(ctx, params.IDOrName)
	if err != nil {
		return model.VMRecord{}, err
	}
	// Stale path: nothing left to signal, only cleanup and reconcile.
	if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
		op.stage("cleanup_stale_runtime")
		if err := d.cleanupRuntime(ctx, vm, true); err != nil {
			return model.VMRecord{}, err
		}
		vm.State = model.VMStateStopped
		vm.Runtime.State = model.VMStateStopped
		clearRuntimeHandles(&vm)
		if err := d.store.UpsertVM(ctx, vm); err != nil {
			return model.VMRecord{}, err
		}
		return vm, nil
	}
	signal := strings.TrimSpace(params.Signal)
	if signal == "" {
		signal = "TERM"
	}
	op.stage("send_signal", "pid", vm.Runtime.PID, "signal", signal)
	// kill(1) takes the signal name prefixed with a dash, e.g. "-TERM".
	if _, err := d.runner.RunSudo(ctx, "kill", "-"+signal, strconv.Itoa(vm.Runtime.PID)); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("wait_for_exit", "pid", vm.Runtime.PID)
	if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("cleanup_runtime")
	if err := d.cleanupRuntime(ctx, vm, true); err != nil {
		return model.VMRecord{}, err
	}
	vm.State = model.VMStateStopped
	vm.Runtime.State = model.VMStateStopped
	clearRuntimeHandles(&vm)
	system.TouchNow(&vm)
	if err := d.store.UpsertVM(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	return vm, nil
}
// RestartVM stops the VM and boots it again by ID. It takes no lock itself:
// StopVM and StartVM each acquire the daemon mutex.
func (d *Daemon) RestartVM(ctx context.Context, idOrName string) (vm model.VMRecord, err error) {
	op := d.beginOperation("vm.restart", "vm_ref", idOrName)
	defer func() {
		if err == nil {
			op.done(vmLogAttrs(vm)...)
		} else {
			op.fail(err, vmLogAttrs(vm)...)
		}
	}()
	op.stage("stop")
	if vm, err = d.StopVM(ctx, idOrName); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("start", vmLogAttrs(vm)...)
	return d.StartVM(ctx, vm.ID)
}
// DeleteVM permanently removes a VM: kills a live process if present, tears
// down runtime resources (without preserving disks), best-effort disables
// NAT and DNS, deletes the store record, and finally removes the VM dir.
func (d *Daemon) DeleteVM(ctx context.Context, idOrName string) (vm model.VMRecord, err error) {
	d.mu.Lock()
	defer d.mu.Unlock()
	op := d.beginOperation("vm.delete", "vm_ref", idOrName)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	vm, err = d.FindVM(ctx, idOrName)
	if err != nil {
		return model.VMRecord{}, err
	}
	if vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
		op.stage("kill_running_vm", "pid", vm.Runtime.PID)
		// Best-effort SIGKILL; cleanupRuntime below also waits for exit.
		_ = d.killVMProcess(ctx, vm.Runtime.PID)
	}
	op.stage("cleanup_runtime")
	// preserveDisks=false: cleanupRuntime removes the VM dir contents too.
	if err := d.cleanupRuntime(ctx, vm, false); err != nil {
		return model.VMRecord{}, err
	}
	if vm.Spec.NATEnabled {
		op.debugStage("disable_nat")
		_ = d.ensureNAT(ctx, vm, false)
	}
	op.debugStage("remove_dns", "dns_name", vm.Runtime.DNSName)
	_ = d.removeDNS(ctx, vm.Runtime.DNSName)
	op.stage("delete_store_record")
	if err := d.store.DeleteVM(ctx, vm.ID); err != nil {
		return model.VMRecord{}, err
	}
	if vm.Runtime.VMDir != "" {
		op.stage("delete_vm_dir", "vm_dir", vm.Runtime.VMDir)
		if err := os.RemoveAll(vm.Runtime.VMDir); err != nil {
			return model.VMRecord{}, err
		}
	}
	return vm, nil
}
// SetVM updates a VM's spec. vcpu/memory/disk changes require the VM to be
// stopped; the work disk may only grow (an equal size is a no-op). NAT can
// be toggled live. The updated record is persisted and returned.
func (d *Daemon) SetVM(ctx context.Context, params api.VMSetParams) (vm model.VMRecord, err error) {
	d.mu.Lock()
	defer d.mu.Unlock()
	op := d.beginOperation("vm.set", "vm_ref", params.IDOrName)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	vm, err = d.FindVM(ctx, params.IDOrName)
	if err != nil {
		return model.VMRecord{}, err
	}
	running := vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath)
	if params.VCPUCount != nil {
		if err := validateOptionalPositiveSetting("vcpu", params.VCPUCount); err != nil {
			return model.VMRecord{}, err
		}
		if running {
			return model.VMRecord{}, errors.New("vcpu changes require the VM to be stopped")
		}
		op.stage("update_vcpu", "vcpu_count", *params.VCPUCount)
		vm.Spec.VCPUCount = *params.VCPUCount
	}
	if params.MemoryMiB != nil {
		if err := validateOptionalPositiveSetting("memory", params.MemoryMiB); err != nil {
			return model.VMRecord{}, err
		}
		if running {
			return model.VMRecord{}, errors.New("memory changes require the VM to be stopped")
		}
		op.stage("update_memory", "memory_mib", *params.MemoryMiB)
		vm.Spec.MemoryMiB = *params.MemoryMiB
	}
	if params.WorkDiskSize != "" {
		size, err := model.ParseSize(params.WorkDiskSize)
		if err != nil {
			return model.VMRecord{}, err
		}
		if running {
			return model.VMRecord{}, errors.New("disk changes require the VM to be stopped")
		}
		if size < vm.Spec.WorkDiskSizeBytes {
			return model.VMRecord{}, errors.New("disk size can only grow")
		}
		if size > vm.Spec.WorkDiskSizeBytes {
			// Resize the on-disk image only if it exists; a VM that never
			// booted has no work disk yet and just records the new size.
			if exists(vm.Runtime.WorkDiskPath) {
				op.stage("resize_work_disk", "from_bytes", vm.Spec.WorkDiskSizeBytes, "to_bytes", size)
				if err := d.validateWorkDiskResizePrereqs(); err != nil {
					return model.VMRecord{}, err
				}
				if err := system.ResizeExt4Image(ctx, d.runner, vm.Runtime.WorkDiskPath, size); err != nil {
					return model.VMRecord{}, err
				}
			}
			vm.Spec.WorkDiskSizeBytes = size
		}
	}
	if params.NATEnabled != nil {
		op.stage("update_nat", "nat_enabled", *params.NATEnabled)
		vm.Spec.NATEnabled = *params.NATEnabled
		// NAT is the one setting applied live to a running VM.
		if running {
			if err := d.ensureNAT(ctx, vm, *params.NATEnabled); err != nil {
				return model.VMRecord{}, err
			}
		}
	}
	system.TouchNow(&vm)
	if err := d.store.UpsertVM(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	return vm, nil
}
// GetVMStats returns the VM record and a fresh stats sample. Collection is
// best-effort: on failure the previously persisted vm.Stats are returned
// with a nil error, and the failure is logged at debug level (matching
// pollStats, which previously logged the same condition while this method
// dropped the error silently).
func (d *Daemon) GetVMStats(ctx context.Context, idOrName string) (model.VMRecord, model.VMStats, error) {
	d.mu.Lock()
	defer d.mu.Unlock()
	vm, err := d.FindVM(ctx, idOrName)
	if err != nil {
		return model.VMRecord{}, model.VMStats{}, err
	}
	stats, err := d.collectStats(ctx, vm)
	if err != nil {
		// Consistency fix: surface the failure in logs like pollStats does
		// instead of swallowing it without a trace.
		if d.logger != nil {
			d.logger.Debug("vm stats collection failed", append(vmLogAttrs(vm), "error", err.Error())...)
		}
		return vm, vm.Stats, nil
	}
	vm.Stats = stats
	vm.UpdatedAt = model.Now()
	// Persist best-effort; stale stats in the store are acceptable.
	_ = d.store.UpsertVM(ctx, vm)
	if d.logger != nil {
		d.logger.Debug("vm stats collected", append(vmLogAttrs(vm), "rss_bytes", stats.RSSBytes, "vsz_bytes", stats.VSZBytes, "cpu_percent", stats.CPUPercent)...)
	}
	return vm, vm.Stats, nil
}
// pollStats samples process/disk stats for every VM with a live firecracker
// process and persists each updated record. Per-VM collection failures are
// logged at debug level and skipped; only listing failures abort the poll.
func (d *Daemon) pollStats(ctx context.Context) error {
	d.mu.Lock()
	defer d.mu.Unlock()
	vms, err := d.store.ListVMs(ctx)
	if err != nil {
		return err
	}
	for _, vm := range vms {
		live := vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath)
		if !live {
			continue
		}
		stats, statsErr := d.collectStats(ctx, vm)
		if statsErr != nil {
			if d.logger != nil {
				d.logger.Debug("vm stats collection failed", append(vmLogAttrs(vm), "error", statsErr.Error())...)
			}
			continue
		}
		vm.Stats = stats
		vm.UpdatedAt = model.Now()
		_ = d.store.UpsertVM(ctx, vm)
	}
	return nil
}
// stopStaleVMs shuts down running VMs that have not been touched within
// config.AutoStopStaleAfter. Disabled when that duration is zero/negative.
// Shutdown is best-effort (errors ignored per VM) with a short 10s exit wait.
func (d *Daemon) stopStaleVMs(ctx context.Context) (err error) {
	if d.config.AutoStopStaleAfter <= 0 {
		return nil
	}
	op := d.beginOperation("vm.stop_stale")
	defer func() {
		if err != nil {
			op.fail(err)
			return
		}
		op.done()
	}()
	d.mu.Lock()
	defer d.mu.Unlock()
	vms, err := d.store.ListVMs(ctx)
	if err != nil {
		return err
	}
	now := model.Now()
	for _, vm := range vms {
		if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
			continue
		}
		if now.Sub(vm.LastTouchedAt) < d.config.AutoStopStaleAfter {
			continue
		}
		op.stage("stopping_vm", vmLogAttrs(vm)...)
		// Best-effort graceful shutdown; runtime cleanup still runs even if
		// the guest ignored Ctrl+Alt+Del or did not exit within 10s.
		_ = d.sendCtrlAltDel(ctx, vm)
		_ = d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 10*time.Second)
		_ = d.cleanupRuntime(ctx, vm, true)
		vm.State = model.VMStateStopped
		vm.Runtime.State = model.VMStateStopped
		clearRuntimeHandles(&vm)
		// Deliberately not TouchNow: auto-stop must not refresh staleness.
		vm.UpdatedAt = model.Now()
		_ = d.store.UpsertVM(ctx, vm)
	}
	return nil
}
// collectStats builds a point-in-time stats sample: disk allocation for the
// overlay and work disk, parsed firecracker metrics, and — only when the
// firecracker process is alive — CPU/RSS/VSZ from the process table.
func (d *Daemon) collectStats(ctx context.Context, vm model.VMRecord) (model.VMStats, error) {
	sample := model.VMStats{
		CollectedAt:        model.Now(),
		SystemOverlayBytes: system.AllocatedBytes(vm.Runtime.SystemOverlay),
		WorkDiskBytes:      system.AllocatedBytes(vm.Runtime.WorkDiskPath),
		MetricsRaw:         system.ParseMetricsFile(vm.Runtime.MetricsPath),
	}
	alive := vm.Runtime.PID > 0 && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath)
	if alive {
		// Process stats are best-effort; a read failure leaves zeros.
		if ps, psErr := system.ReadProcessStats(ctx, vm.Runtime.PID); psErr == nil {
			sample.CPUPercent = ps.CPUPercent
			sample.RSSBytes = ps.RSSBytes
			sample.VSZBytes = ps.VSZBytes
		}
	}
	return sample, nil
}
// ensureSystemOverlay creates the sparse copy-on-write overlay file at the
// configured size if it does not already exist.
func (d *Daemon) ensureSystemOverlay(ctx context.Context, vm *model.VMRecord) error {
	if !exists(vm.Runtime.SystemOverlay) {
		size := strconv.FormatInt(vm.Spec.SystemOverlaySizeByte, 10)
		if _, err := d.runner.Run(ctx, "truncate", "-s", size, vm.Runtime.SystemOverlay); err != nil {
			return err
		}
	}
	return nil
}
// patchRootOverlay writes per-VM guest configuration files (DNS, hostname,
// hosts, fstab, sshd drop-in) into the VM's root filesystem through the
// device-mapper snapshot device, before the VM boots. Writes go through
// debugfs-style helpers, so the filesystem is never mounted here.
func (d *Daemon) patchRootOverlay(ctx context.Context, vm model.VMRecord, image model.Image) error {
	resolv := []byte(fmt.Sprintf("nameserver %s\n", d.config.DefaultDNS))
	hostname := []byte(vm.Name + "\n")
	hosts := []byte(fmt.Sprintf("127.0.0.1 localhost\n127.0.1.1 %s\n", vm.Name))
	// sshd drop-in enabling root key login; StrictModes off since file
	// ownership inside the image may not match the daemon's uid.
	sshdConfig := []byte(strings.Join([]string{
		"LogLevel DEBUG3",
		"PermitRootLogin yes",
		"PubkeyAuthentication yes",
		"AuthorizedKeysFile /root/.ssh/authorized_keys",
		"StrictModes no",
		"",
	}, "\n"))
	// Missing/unreadable fstab is tolerated: start from empty and rewrite.
	fstab, err := system.ReadDebugFSText(ctx, d.runner, vm.Runtime.DMDev, "/etc/fstab")
	if err != nil {
		fstab = ""
	}
	newFSTab := system.UpdateFSTab(fstab)
	// Write order is irrelevant (map iteration is random); each file is
	// independent of the others.
	for guestPath, data := range map[string][]byte{
		"/etc/resolv.conf":                      resolv,
		"/etc/hostname":                         hostname,
		"/etc/hosts":                            hosts,
		"/etc/fstab":                            []byte(newFSTab),
		"/etc/ssh/sshd_config.d/99-banger.conf": sshdConfig,
	} {
		if err := system.WriteExt4File(ctx, d.runner, vm.Runtime.DMDev, guestPath, data); err != nil {
			return err
		}
	}
	return nil
}
// ensureWorkDisk creates and seeds the VM's persistent work disk on first
// boot: allocate a sparse file, mkfs.ext4 it, mount both the root snapshot
// (read-only) and the new disk, copy /root's contents across, then flatten
// any accidentally nested /root directory the copy produced. No-op when the
// disk already exists.
func (d *Daemon) ensureWorkDisk(ctx context.Context, vm *model.VMRecord) error {
	if exists(vm.Runtime.WorkDiskPath) {
		return nil
	}
	if _, err := d.runner.Run(ctx, "truncate", "-s", strconv.FormatInt(vm.Spec.WorkDiskSizeBytes, 10), vm.Runtime.WorkDiskPath); err != nil {
		return err
	}
	if _, err := d.runner.Run(ctx, "mkfs.ext4", "-F", vm.Runtime.WorkDiskPath); err != nil {
		return err
	}
	// Root snapshot is mounted read-only (third arg true); work disk rw.
	rootMount, cleanupRoot, err := system.MountTempDir(ctx, d.runner, vm.Runtime.DMDev, true)
	if err != nil {
		return err
	}
	defer cleanupRoot()
	workMount, cleanupWork, err := system.MountTempDir(ctx, d.runner, vm.Runtime.WorkDiskPath, false)
	if err != nil {
		return err
	}
	defer cleanupWork()
	if err := system.CopyDirContents(ctx, d.runner, filepath.Join(rootMount, "root"), workMount, true); err != nil {
		return err
	}
	if err := d.flattenNestedWorkHome(ctx, workMount); err != nil {
		return err
	}
	return nil
}
// flattenNestedWorkHome repairs a work disk where the seed copy left a
// nested <workMount>/root directory: its children are hoisted up into
// workMount and the now-empty directory is removed. The directory may be a
// root-owned 0700 tree, so it is chmod'ed under sudo first to make it
// readable by the calling user, and each child is copied individually with
// `cp -a` under sudo (see the commit note on the shell-out cleanup
// regression this fixes).
func (d *Daemon) flattenNestedWorkHome(ctx context.Context, workMount string) error {
	nestedHome := filepath.Join(workMount, "root")
	if !exists(nestedHome) {
		return nil
	}
	// Make the root-owned scratch dir listable before os.ReadDir below.
	if _, err := d.runner.RunSudo(ctx, "chmod", "755", nestedHome); err != nil {
		return err
	}
	entries, err := os.ReadDir(nestedHome)
	if err != nil {
		return err
	}
	for _, entry := range entries {
		sourcePath := filepath.Join(nestedHome, entry.Name())
		// Trailing slash on the destination: copy INTO workMount.
		if _, err := d.runner.RunSudo(ctx, "cp", "-a", sourcePath, workMount+"/"); err != nil {
			return err
		}
	}
	_, err = d.runner.RunSudo(ctx, "rm", "-rf", nestedHome)
	return err
}
// ensureBridge makes sure the VM network bridge exists and is up. A
// successful `ip link show` (err == nil) means the bridge already exists and
// only needs bringing up; otherwise it is created, addressed, and raised.
func (d *Daemon) ensureBridge(ctx context.Context) error {
	if _, err := d.runner.Run(ctx, "ip", "link", "show", d.config.BridgeName); err == nil {
		_, err = d.runner.RunSudo(ctx, "ip", "link", "set", d.config.BridgeName, "up")
		return err
	}
	if _, err := d.runner.RunSudo(ctx, "ip", "link", "add", "name", d.config.BridgeName, "type", "bridge"); err != nil {
		return err
	}
	// e.g. 192.168.64.1/24 — gateway address plus configured prefix length.
	if _, err := d.runner.RunSudo(ctx, "ip", "addr", "add", fmt.Sprintf("%s/%s", d.config.BridgeIP, d.config.CIDR), "dev", d.config.BridgeName); err != nil {
		return err
	}
	_, err := d.runner.RunSudo(ctx, "ip", "link", "set", d.config.BridgeName, "up")
	return err
}
// ensureSocketDir creates the runtime directory that holds the firecracker
// API sockets, if it is not already present.
func (d *Daemon) ensureSocketDir() error {
	if err := os.MkdirAll(d.layout.RuntimeDir, 0o755); err != nil {
		return err
	}
	return nil
}
// createTap (re)creates the VM's tap device, attaches it to the bridge, and
// brings both interfaces up. A pre-existing tap of the same name is deleted
// first so the device is always freshly owned by the daemon's uid/gid.
func (d *Daemon) createTap(ctx context.Context, tap string) error {
	// err == nil means the tap already exists — remove it (best-effort).
	if _, err := d.runner.Run(ctx, "ip", "link", "show", tap); err == nil {
		_, _ = d.runner.RunSudo(ctx, "ip", "link", "del", tap)
	}
	// Owned by the daemon user so firecracker can open it without root.
	if _, err := d.runner.RunSudo(ctx, "ip", "tuntap", "add", "dev", tap, "mode", "tap", "user", strconv.Itoa(os.Getuid()), "group", strconv.Itoa(os.Getgid())); err != nil {
		return err
	}
	if _, err := d.runner.RunSudo(ctx, "ip", "link", "set", tap, "master", d.config.BridgeName); err != nil {
		return err
	}
	if _, err := d.runner.RunSudo(ctx, "ip", "link", "set", tap, "up"); err != nil {
		return err
	}
	_, err := d.runner.RunSudo(ctx, "ip", "link", "set", d.config.BridgeName, "up")
	return err
}
// firecrackerBinary returns the configured firecracker binary path after
// checking that it is set and present on disk; both error messages include
// the runtime-bundle hint telling the operator how to install it.
func (d *Daemon) firecrackerBinary() (string, error) {
	binPath := d.config.FirecrackerBin
	switch {
	case binPath == "":
		return "", fmt.Errorf("firecracker binary not configured; %s", paths.RuntimeBundleHint())
	case !exists(binPath):
		return "", fmt.Errorf("firecracker binary not found at %s; %s", binPath, paths.RuntimeBundleHint())
	}
	return binPath, nil
}
// ensureSocketAccess chowns the firecracker API socket to the daemon's
// uid:gid and restricts it to 0600 so the SDK client can talk to a socket
// created by a sudo-launched firecracker.
func (d *Daemon) ensureSocketAccess(ctx context.Context, apiSock string) error {
	owner := fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid())
	if _, err := d.runner.RunSudo(ctx, "chown", owner, apiSock); err != nil {
		return err
	}
	if _, err := d.runner.RunSudo(ctx, "chmod", "600", apiSock); err != nil {
		return err
	}
	return nil
}
// findFirecrackerPID locates the newest process whose command line mentions
// the API socket path (pgrep -n -f) and parses its PID.
func (d *Daemon) findFirecrackerPID(ctx context.Context, apiSock string) (int, error) {
	raw, err := d.runner.Run(ctx, "pgrep", "-n", "-f", apiSock)
	if err != nil {
		return 0, err
	}
	pid, convErr := strconv.Atoi(strings.TrimSpace(string(raw)))
	if convErr != nil {
		return 0, convErr
	}
	return pid, nil
}
// resolveFirecrackerPID determines the firecracker PID, preferring a pgrep
// lookup against the API socket path and falling back to the SDK machine
// handle. Returns 0 when neither source yields a positive PID.
func (d *Daemon) resolveFirecrackerPID(ctx context.Context, machine *firecracker.Machine, apiSock string) int {
	pid, err := d.findFirecrackerPID(ctx, apiSock)
	if err == nil && pid > 0 {
		return pid
	}
	if machine == nil {
		return 0
	}
	pid, err = machine.PID()
	if err != nil || pid <= 0 {
		return 0
	}
	return pid
}
// sendCtrlAltDel asks the guest to reboot/shutdown by sending Ctrl+Alt+Del
// through the firecracker API socket, fixing socket ownership first so the
// client can connect.
func (d *Daemon) sendCtrlAltDel(ctx context.Context, vm model.VMRecord) error {
	sock := vm.Runtime.APISockPath
	if err := d.ensureSocketAccess(ctx, sock); err != nil {
		return err
	}
	return firecracker.New(sock, d.logger).SendCtrlAltDel(ctx)
}
// waitForExit polls every 100ms until the process is gone, the timeout
// elapses (error), or ctx is canceled (ctx.Err()).
func (d *Daemon) waitForExit(ctx context.Context, pid int, apiSock string, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	tick := time.NewTicker(100 * time.Millisecond)
	defer tick.Stop()
	for system.ProcessRunning(pid, apiSock) {
		if time.Now().After(deadline) {
			return fmt.Errorf("timed out waiting for VM to exit")
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-tick.C:
		}
	}
	return nil
}
// cleanupRuntime tears down a VM's ephemeral resources: kills a live
// firecracker process (and waits for exit), removes the tap device and API
// socket, releases the device-mapper snapshot and loop devices, disables
// NAT, and drops DNS. With preserveDisks=false the whole VM directory is
// removed as well. Most steps are best-effort; only the exit wait and the
// snapshot/dir removal can return errors.
func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserveDisks bool) error {
	if d.logger != nil {
		d.logger.Debug("cleanup runtime", append(vmLogAttrs(vm), "preserve_disks", preserveDisks)...)
	}
	if vm.Runtime.PID > 0 && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
		_ = d.killVMProcess(ctx, vm.Runtime.PID)
		// The dm snapshot cannot be released while the process holds it.
		if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
			return err
		}
	}
	if vm.Runtime.TapDevice != "" {
		_, _ = d.runner.RunSudo(ctx, "ip", "link", "del", vm.Runtime.TapDevice)
	}
	if vm.Runtime.APISockPath != "" {
		_ = os.Remove(vm.Runtime.APISockPath)
	}
	snapshotErr := d.cleanupDMSnapshot(ctx, dmSnapshotHandles{
		BaseLoop: vm.Runtime.BaseLoop,
		COWLoop:  vm.Runtime.COWLoop,
		DMName:   vm.Runtime.DMName,
		DMDev:    vm.Runtime.DMDev,
	})
	if vm.Spec.NATEnabled {
		_ = d.ensureNAT(ctx, vm, false)
	}
	_ = d.removeDNS(ctx, vm.Runtime.DNSName)
	if !preserveDisks && vm.Runtime.VMDir != "" {
		// Surface both a snapshot failure and a dir-removal failure.
		return errors.Join(snapshotErr, os.RemoveAll(vm.Runtime.VMDir))
	}
	return snapshotErr
}
// clearRuntimeHandles resets the per-boot runtime fields (process, socket,
// tap and device-mapper handles) while leaving persistent runtime data like
// GuestIP, DNSName, and the on-disk paths intact.
func clearRuntimeHandles(vm *model.VMRecord) {
	rt := &vm.Runtime
	rt.PID = 0
	rt.APISockPath, rt.TapDevice = "", ""
	rt.BaseLoop, rt.COWLoop = "", ""
	rt.DMName, rt.DMDev = "", ""
}
// setDNS registers the VM's DNS record pointing at its guest IP. A nil
// vmDNS backend disables DNS management and makes this a no-op.
func (d *Daemon) setDNS(ctx context.Context, vmName, guestIP string) error {
	if d.vmDNS == nil {
		return nil
	}
	record := vmdns.RecordName(vmName)
	return d.vmDNS.Set(record, guestIP)
}
// removeDNS deletes the VM's DNS record. No-op for an empty name or when
// DNS management is disabled (nil vmDNS backend).
func (d *Daemon) removeDNS(ctx context.Context, dnsName string) error {
	if dnsName == "" || d.vmDNS == nil {
		return nil
	}
	return d.vmDNS.Remove(dnsName)
}
// rebuildDNS replaces the full DNS record set from the store: one record per
// VM that is marked running, has a live firecracker process, and has a
// non-blank guest IP. No-op when DNS management is disabled.
func (d *Daemon) rebuildDNS(ctx context.Context) error {
	if d.vmDNS == nil {
		return nil
	}
	vms, err := d.store.ListVMs(ctx)
	if err != nil {
		return err
	}
	records := make(map[string]string, len(vms))
	for _, vm := range vms {
		eligible := vm.State == model.VMStateRunning &&
			system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) &&
			strings.TrimSpace(vm.Runtime.GuestIP) != ""
		if eligible {
			records[vmdns.RecordName(vm.Name)] = vm.Runtime.GuestIP
		}
	}
	return d.vmDNS.Replace(records)
}
// killVMProcess sends SIGKILL to the given PID via sudo kill(1).
func (d *Daemon) killVMProcess(ctx context.Context, pid int) error {
	if _, err := d.runner.RunSudo(ctx, "kill", "-KILL", strconv.Itoa(pid)); err != nil {
		return err
	}
	return nil
}
// generateName produces a VM name. It prefers the configured namegen
// executable's trimmed output; if the binary is missing, fails, or prints
// nothing, it falls back to "vm-<unix-timestamp>". Never returns an error.
func (d *Daemon) generateName(ctx context.Context) (string, error) {
	if exists(d.config.NamegenPath) {
		if out, err := d.runner.Run(ctx, d.config.NamegenPath); err == nil {
			if name := strings.TrimSpace(string(out)); name != "" {
				return name, nil
			}
		}
	}
	return "vm-" + strconv.FormatInt(time.Now().Unix(), 10), nil
}
// bridgePrefix returns the first three dot-separated components of bridgeIP
// (the /24 network prefix used for guest IP allocation). Inputs with fewer
// than three components are returned unchanged.
func bridgePrefix(bridgeIP string) string {
	octets := strings.SplitN(bridgeIP, ".", 4)
	if len(octets) < 3 {
		return bridgeIP
	}
	return octets[0] + "." + octets[1] + "." + octets[2]
}
// optionalIntOrDefault dereferences value when it is set, otherwise returns
// fallback.
func optionalIntOrDefault(value *int, fallback int) int {
	if value == nil {
		return fallback
	}
	return *value
}
// validateOptionalPositiveSetting accepts a nil (unset) value, and otherwise
// requires a strictly positive integer, reporting the offending setting by
// label.
func validateOptionalPositiveSetting(label string, value *int) error {
	switch {
	case value == nil:
		return nil
	case *value > 0:
		return nil
	default:
		return fmt.Errorf("%s must be a positive integer", label)
	}
}