Move the supported systemd path to two services: an owner-user bangerd for orchestration and a narrow root helper for bridge/tap, NAT/resolver, dm/loop, and Firecracker ownership. This removes repeated sudo from daily vm and image flows without leaving the general daemon running as root. Add install metadata, system install/status/restart/uninstall commands, and a system-owned runtime layout. Keep user SSH/config material in the owner home, lock file_sync to the owner home, and move daemon known_hosts handling out of the old root-owned control path. Route privileged lifecycle steps through typed privilegedOps calls, harden the two systemd units, and rewrite smoke plus docs around the supported service model. Verified with make build, make test, make lint, and make smoke on the supported systemd host path.
280 lines
8.1 KiB
Go
280 lines
8.1 KiB
Go
package daemon
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"banger/internal/api"
|
|
"banger/internal/model"
|
|
"banger/internal/system"
|
|
)
|
|
|
|
func (s *VMService) StartVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
|
|
return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
|
|
image, err := s.store.GetImageByID(ctx, vm.ImageID)
|
|
if err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
if s.vmAlive(vm) {
|
|
if s.logger != nil {
|
|
s.logger.Info("vm already running", vmLogAttrs(vm)...)
|
|
}
|
|
return vm, nil
|
|
}
|
|
return s.startVMLocked(ctx, vm, image)
|
|
})
|
|
}
|
|
|
|
func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image model.Image) (_ model.VMRecord, err error) {
|
|
op := s.beginOperation("vm.start", append(vmLogAttrs(vm), imageLogAttrs(image)...)...)
|
|
defer func() {
|
|
if err != nil {
|
|
err = annotateLogPath(err, vm.Runtime.LogPath)
|
|
op.fail(err, vmLogAttrs(vm)...)
|
|
return
|
|
}
|
|
op.done(vmLogAttrs(vm)...)
|
|
}()
|
|
|
|
// Derive per-VM paths/names up front so every step sees the same
|
|
// values. Shortening vm.ID mirrors how the pre-refactor inline
|
|
// code did it.
|
|
shortID := system.ShortID(vm.ID)
|
|
apiSock := filepath.Join(s.layout.RuntimeDir, "fc-"+shortID+".sock")
|
|
dmName := "fc-rootfs-" + shortID
|
|
tapName := "tap-fc-" + shortID
|
|
if strings.TrimSpace(vm.Runtime.VSockPath) == "" {
|
|
vm.Runtime.VSockPath = defaultVSockPath(s.layout.RuntimeDir, vm.ID)
|
|
}
|
|
if vm.Runtime.VSockCID == 0 {
|
|
vm.Runtime.VSockCID, err = defaultVSockCID(vm.Runtime.GuestIP)
|
|
if err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
}
|
|
|
|
live := model.VMHandles{}
|
|
sc := &startContext{
|
|
vm: &vm,
|
|
image: image,
|
|
live: &live,
|
|
apiSock: apiSock,
|
|
dmName: dmName,
|
|
tapName: tapName,
|
|
}
|
|
|
|
if runErr := s.runStartSteps(ctx, op, sc, s.buildStartSteps(op, sc)); runErr != nil {
|
|
// The step driver already ran rollback in reverse for every
|
|
// succeeded step. All that's left is to persist the ERROR
|
|
// state so operators see the failure via `vm show`. Use a
|
|
// fresh context in case the request ctx is cancelled — DB
|
|
// writes past this point are recovery, not user-driven.
|
|
//
|
|
// The store check is for tests that construct a bare Daemon
|
|
// without a DB; production always has s.store non-nil.
|
|
vm.State = model.VMStateError
|
|
vm.Runtime.State = model.VMStateError
|
|
vm.Runtime.LastError = runErr.Error()
|
|
clearRuntimeTeardownState(&vm)
|
|
s.clearVMHandles(vm)
|
|
if s.store != nil {
|
|
_ = s.store.UpsertVM(context.Background(), vm)
|
|
}
|
|
return model.VMRecord{}, runErr
|
|
}
|
|
return vm, nil
|
|
}
|
|
|
|
func (s *VMService) StopVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
|
|
return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
|
|
return s.stopVMLocked(ctx, vm)
|
|
})
|
|
}
|
|
|
|
func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
|
|
vm = current
|
|
op := s.beginOperation("vm.stop", "vm_ref", vm.ID)
|
|
defer func() {
|
|
if err != nil {
|
|
op.fail(err, vmLogAttrs(vm)...)
|
|
return
|
|
}
|
|
op.done(vmLogAttrs(vm)...)
|
|
}()
|
|
if !s.vmAlive(vm) {
|
|
op.stage("cleanup_stale_runtime")
|
|
if err := s.cleanupRuntime(ctx, vm, true); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
vm.State = model.VMStateStopped
|
|
vm.Runtime.State = model.VMStateStopped
|
|
clearRuntimeTeardownState(&vm)
|
|
s.clearVMHandles(vm)
|
|
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
return vm, nil
|
|
}
|
|
pid := s.vmHandles(vm.ID).PID
|
|
op.stage("graceful_shutdown")
|
|
if err := s.net.sendCtrlAltDel(ctx, vm.Runtime.APISockPath); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
op.stage("wait_for_exit", "pid", pid)
|
|
if err := s.net.waitForExit(ctx, pid, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil {
|
|
if !errors.Is(err, errWaitForExitTimeout) {
|
|
return model.VMRecord{}, err
|
|
}
|
|
op.stage("graceful_shutdown_timeout", "pid", pid)
|
|
}
|
|
op.stage("cleanup_runtime")
|
|
if err := s.cleanupRuntime(ctx, vm, true); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
vm.State = model.VMStateStopped
|
|
vm.Runtime.State = model.VMStateStopped
|
|
clearRuntimeTeardownState(&vm)
|
|
s.clearVMHandles(vm)
|
|
system.TouchNow(&vm)
|
|
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
return vm, nil
|
|
}
|
|
|
|
func (s *VMService) KillVM(ctx context.Context, params api.VMKillParams) (model.VMRecord, error) {
|
|
return s.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
|
|
return s.killVMLocked(ctx, vm, params.Signal)
|
|
})
|
|
}
|
|
|
|
func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, signalValue string) (vm model.VMRecord, err error) {
|
|
vm = current
|
|
op := s.beginOperation("vm.kill", "vm_ref", vm.ID, "signal", signalValue)
|
|
defer func() {
|
|
if err != nil {
|
|
op.fail(err, vmLogAttrs(vm)...)
|
|
return
|
|
}
|
|
op.done(vmLogAttrs(vm)...)
|
|
}()
|
|
if !s.vmAlive(vm) {
|
|
op.stage("cleanup_stale_runtime")
|
|
if err := s.cleanupRuntime(ctx, vm, true); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
vm.State = model.VMStateStopped
|
|
vm.Runtime.State = model.VMStateStopped
|
|
clearRuntimeTeardownState(&vm)
|
|
s.clearVMHandles(vm)
|
|
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
return vm, nil
|
|
}
|
|
|
|
signal := strings.TrimSpace(signalValue)
|
|
if signal == "" {
|
|
signal = "TERM"
|
|
}
|
|
pid := s.vmHandles(vm.ID).PID
|
|
op.stage("send_signal", "pid", pid, "signal", signal)
|
|
if err := s.privOps().SignalProcess(ctx, pid, signal); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
op.stage("wait_for_exit", "pid", pid)
|
|
if err := s.net.waitForExit(ctx, pid, vm.Runtime.APISockPath, 30*time.Second); err != nil {
|
|
if !errors.Is(err, errWaitForExitTimeout) {
|
|
return model.VMRecord{}, err
|
|
}
|
|
op.stage("signal_timeout", "pid", pid, "signal", signal)
|
|
}
|
|
op.stage("cleanup_runtime")
|
|
if err := s.cleanupRuntime(ctx, vm, true); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
vm.State = model.VMStateStopped
|
|
vm.Runtime.State = model.VMStateStopped
|
|
clearRuntimeTeardownState(&vm)
|
|
s.clearVMHandles(vm)
|
|
system.TouchNow(&vm)
|
|
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
return vm, nil
|
|
}
|
|
|
|
func (s *VMService) RestartVM(ctx context.Context, idOrName string) (vm model.VMRecord, err error) {
|
|
op := s.beginOperation("vm.restart", "vm_ref", idOrName)
|
|
defer func() {
|
|
if err != nil {
|
|
op.fail(err, vmLogAttrs(vm)...)
|
|
return
|
|
}
|
|
op.done(vmLogAttrs(vm)...)
|
|
}()
|
|
resolved, err := s.FindVM(ctx, idOrName)
|
|
if err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
return s.withVMLockByID(ctx, resolved.ID, func(vm model.VMRecord) (model.VMRecord, error) {
|
|
op.stage("stop")
|
|
vm, err = s.stopVMLocked(ctx, vm)
|
|
if err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
image, err := s.store.GetImageByID(ctx, vm.ImageID)
|
|
if err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
op.stage("start", vmLogAttrs(vm)...)
|
|
return s.startVMLocked(ctx, vm, image)
|
|
})
|
|
}
|
|
|
|
func (s *VMService) DeleteVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
|
|
return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
|
|
return s.deleteVMLocked(ctx, vm)
|
|
})
|
|
}
|
|
|
|
func (s *VMService) deleteVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
|
|
vm = current
|
|
op := s.beginOperation("vm.delete", "vm_ref", vm.ID)
|
|
defer func() {
|
|
if err != nil {
|
|
op.fail(err, vmLogAttrs(vm)...)
|
|
return
|
|
}
|
|
op.done(vmLogAttrs(vm)...)
|
|
}()
|
|
if s.vmAlive(vm) {
|
|
pid := s.vmHandles(vm.ID).PID
|
|
op.stage("kill_running_vm", "pid", pid)
|
|
_ = s.net.killVMProcess(ctx, pid)
|
|
}
|
|
op.stage("cleanup_runtime")
|
|
if err := s.cleanupRuntime(ctx, vm, false); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
clearRuntimeTeardownState(&vm)
|
|
op.stage("delete_store_record")
|
|
if err := s.store.DeleteVM(ctx, vm.ID); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
if vm.Runtime.VMDir != "" {
|
|
op.stage("delete_vm_dir", "vm_dir", vm.Runtime.VMDir)
|
|
if err := os.RemoveAll(vm.Runtime.VMDir); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
}
|
|
// Drop any host-key pins. A future VM reusing this IP or name
|
|
// would otherwise trip the TOFU mismatch branch in
|
|
// TOFUHostKeyCallback and fail to connect.
|
|
removeVMKnownHosts(s.layout.KnownHostsPath, vm, s.logger)
|
|
return vm, nil
|
|
}
|