Three recovery-path errors were silently dropped: - vm_lifecycle.go startVMLocked persisted the VMStateError record with `_ = s.store.UpsertVM(...)`. If the persist failed the user saw the original start error but operators had no way to find out the store had also drifted out of sync. - vm_lifecycle.go deleteVMLocked killed the firecracker process with `_ = s.net.killVMProcess(...)`. cleanupRuntime tears it down regardless, so the explicit kill is best-effort, but a permission-denied / EPERM was still worth logging. - capabilities.go cleanupPreparedCapabilities collected per-cap errors with errors.Join. Callers get the aggregated value but couldn't tell which capability failed when more than one did. All three now log Warn before the original behaviour continues. The aggregate return value, control flow, and user-visible error strings are unchanged — this is purely a "less silence in the journal" pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
296 lines
9.1 KiB
Go
296 lines
9.1 KiB
Go
package daemon
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"banger/internal/api"
|
|
"banger/internal/model"
|
|
"banger/internal/system"
|
|
)
|
|
|
|
func (s *VMService) StartVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
|
|
return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
|
|
image, err := s.store.GetImageByID(ctx, vm.ImageID)
|
|
if err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
if s.vmAlive(vm) {
|
|
if s.logger != nil {
|
|
s.logger.Info("vm already running", vmLogAttrs(vm)...)
|
|
}
|
|
return vm, nil
|
|
}
|
|
return s.startVMLocked(ctx, vm, image)
|
|
})
|
|
}
|
|
|
|
func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image model.Image) (_ model.VMRecord, err error) {
|
|
op := s.beginOperation(ctx, "vm.start", append(vmLogAttrs(vm), imageLogAttrs(image)...)...)
|
|
defer func() {
|
|
if err != nil {
|
|
err = annotateLogPath(err, vm.Runtime.LogPath)
|
|
op.fail(err, vmLogAttrs(vm)...)
|
|
return
|
|
}
|
|
op.done(vmLogAttrs(vm)...)
|
|
}()
|
|
|
|
// Derive per-VM paths/names up front so every step sees the same
|
|
// values. Shortening vm.ID mirrors how the pre-refactor inline
|
|
// code did it.
|
|
shortID := system.ShortID(vm.ID)
|
|
apiSock := filepath.Join(s.layout.RuntimeDir, "fc-"+shortID+".sock")
|
|
dmName := "fc-rootfs-" + shortID
|
|
tapName := "tap-fc-" + shortID
|
|
if strings.TrimSpace(vm.Runtime.VSockPath) == "" {
|
|
vm.Runtime.VSockPath = defaultVSockPath(s.layout.RuntimeDir, vm.ID)
|
|
}
|
|
if vm.Runtime.VSockCID == 0 {
|
|
vm.Runtime.VSockCID, err = defaultVSockCID(vm.Runtime.GuestIP)
|
|
if err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
}
|
|
|
|
live := model.VMHandles{}
|
|
sc := &startContext{
|
|
vm: &vm,
|
|
image: image,
|
|
live: &live,
|
|
apiSock: apiSock,
|
|
dmName: dmName,
|
|
tapName: tapName,
|
|
}
|
|
|
|
if runErr := s.runStartSteps(ctx, op, sc, s.buildStartSteps(op, sc)); runErr != nil {
|
|
// The step driver already ran rollback in reverse for every
|
|
// succeeded step. All that's left is to persist the ERROR
|
|
// state so operators see the failure via `vm show`. Use a
|
|
// fresh context in case the request ctx is cancelled — DB
|
|
// writes past this point are recovery, not user-driven.
|
|
//
|
|
// The store check is for tests that construct a bare Daemon
|
|
// without a DB; production always has s.store non-nil.
|
|
vm.State = model.VMStateError
|
|
vm.Runtime.State = model.VMStateError
|
|
vm.Runtime.LastError = runErr.Error()
|
|
clearRuntimeTeardownState(&vm)
|
|
s.clearVMHandles(vm)
|
|
if s.store != nil {
|
|
// We're in the recovery path: the start has already
|
|
// failed, and the user will see runErr. A persist
|
|
// failure here only affects what 'banger vm show'
|
|
// reads on the next call, so we keep returning runErr
|
|
// — but a silent swallow leaves operators unable to
|
|
// debug "why does the record still say running?". Log
|
|
// at warn instead.
|
|
if persistErr := s.store.UpsertVM(context.Background(), vm); persistErr != nil && s.logger != nil {
|
|
s.logger.Warn("persist vm error state failed", append(vmLogAttrs(vm), "error", persistErr.Error())...)
|
|
}
|
|
}
|
|
return model.VMRecord{}, runErr
|
|
}
|
|
return vm, nil
|
|
}
|
|
|
|
func (s *VMService) StopVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
|
|
return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
|
|
return s.stopVMLocked(ctx, vm)
|
|
})
|
|
}
|
|
|
|
func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
|
|
vm = current
|
|
op := s.beginOperation(ctx, "vm.stop", "vm_ref", vm.ID)
|
|
defer func() {
|
|
if err != nil {
|
|
op.fail(err, vmLogAttrs(vm)...)
|
|
return
|
|
}
|
|
op.done(vmLogAttrs(vm)...)
|
|
}()
|
|
if !s.vmAlive(vm) {
|
|
op.stage("cleanup_stale_runtime")
|
|
if err := s.cleanupRuntime(ctx, vm, true); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
vm.State = model.VMStateStopped
|
|
vm.Runtime.State = model.VMStateStopped
|
|
clearRuntimeTeardownState(&vm)
|
|
s.clearVMHandles(vm)
|
|
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
return vm, nil
|
|
}
|
|
pid := s.vmHandles(vm.ID).PID
|
|
op.stage("graceful_shutdown")
|
|
if err := s.net.sendCtrlAltDel(ctx, vm.Runtime.APISockPath); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
op.stage("wait_for_exit", "pid", pid)
|
|
if err := s.net.waitForExit(ctx, pid, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil {
|
|
if !errors.Is(err, errWaitForExitTimeout) {
|
|
return model.VMRecord{}, err
|
|
}
|
|
op.stage("graceful_shutdown_timeout", "pid", pid)
|
|
}
|
|
op.stage("cleanup_runtime")
|
|
if err := s.cleanupRuntime(ctx, vm, true); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
vm.State = model.VMStateStopped
|
|
vm.Runtime.State = model.VMStateStopped
|
|
clearRuntimeTeardownState(&vm)
|
|
s.clearVMHandles(vm)
|
|
system.TouchNow(&vm)
|
|
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
return vm, nil
|
|
}
|
|
|
|
func (s *VMService) KillVM(ctx context.Context, params api.VMKillParams) (model.VMRecord, error) {
|
|
return s.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
|
|
return s.killVMLocked(ctx, vm, params.Signal)
|
|
})
|
|
}
|
|
|
|
func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, signalValue string) (vm model.VMRecord, err error) {
|
|
vm = current
|
|
op := s.beginOperation(ctx, "vm.kill", "vm_ref", vm.ID, "signal", signalValue)
|
|
defer func() {
|
|
if err != nil {
|
|
op.fail(err, vmLogAttrs(vm)...)
|
|
return
|
|
}
|
|
op.done(vmLogAttrs(vm)...)
|
|
}()
|
|
if !s.vmAlive(vm) {
|
|
op.stage("cleanup_stale_runtime")
|
|
if err := s.cleanupRuntime(ctx, vm, true); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
vm.State = model.VMStateStopped
|
|
vm.Runtime.State = model.VMStateStopped
|
|
clearRuntimeTeardownState(&vm)
|
|
s.clearVMHandles(vm)
|
|
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
return vm, nil
|
|
}
|
|
|
|
signal := strings.TrimSpace(signalValue)
|
|
if signal == "" {
|
|
signal = "TERM"
|
|
}
|
|
pid := s.vmHandles(vm.ID).PID
|
|
op.stage("send_signal", "pid", pid, "signal", signal)
|
|
if err := s.privOps().SignalProcess(ctx, pid, signal); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
op.stage("wait_for_exit", "pid", pid)
|
|
if err := s.net.waitForExit(ctx, pid, vm.Runtime.APISockPath, 30*time.Second); err != nil {
|
|
if !errors.Is(err, errWaitForExitTimeout) {
|
|
return model.VMRecord{}, err
|
|
}
|
|
op.stage("signal_timeout", "pid", pid, "signal", signal)
|
|
}
|
|
op.stage("cleanup_runtime")
|
|
if err := s.cleanupRuntime(ctx, vm, true); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
vm.State = model.VMStateStopped
|
|
vm.Runtime.State = model.VMStateStopped
|
|
clearRuntimeTeardownState(&vm)
|
|
s.clearVMHandles(vm)
|
|
system.TouchNow(&vm)
|
|
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
return vm, nil
|
|
}
|
|
|
|
func (s *VMService) RestartVM(ctx context.Context, idOrName string) (vm model.VMRecord, err error) {
|
|
op := s.beginOperation(ctx, "vm.restart", "vm_ref", idOrName)
|
|
defer func() {
|
|
if err != nil {
|
|
op.fail(err, vmLogAttrs(vm)...)
|
|
return
|
|
}
|
|
op.done(vmLogAttrs(vm)...)
|
|
}()
|
|
resolved, err := s.FindVM(ctx, idOrName)
|
|
if err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
return s.withVMLockByID(ctx, resolved.ID, func(vm model.VMRecord) (model.VMRecord, error) {
|
|
op.stage("stop")
|
|
vm, err = s.stopVMLocked(ctx, vm)
|
|
if err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
image, err := s.store.GetImageByID(ctx, vm.ImageID)
|
|
if err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
op.stage("start", vmLogAttrs(vm)...)
|
|
return s.startVMLocked(ctx, vm, image)
|
|
})
|
|
}
|
|
|
|
func (s *VMService) DeleteVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
|
|
return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
|
|
return s.deleteVMLocked(ctx, vm)
|
|
})
|
|
}
|
|
|
|
func (s *VMService) deleteVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
|
|
vm = current
|
|
op := s.beginOperation(ctx, "vm.delete", "vm_ref", vm.ID)
|
|
defer func() {
|
|
if err != nil {
|
|
op.fail(err, vmLogAttrs(vm)...)
|
|
return
|
|
}
|
|
op.done(vmLogAttrs(vm)...)
|
|
}()
|
|
if s.vmAlive(vm) {
|
|
pid := s.vmHandles(vm.ID).PID
|
|
op.stage("kill_running_vm", "pid", pid)
|
|
// Best-effort: cleanupRuntime below tears the process down
|
|
// regardless. A kill failure here only matters when it
|
|
// surfaces something operators should see (permission
|
|
// denied, etc.), so promote it from a silent _ to a Warn
|
|
// without changing the control flow.
|
|
if killErr := s.net.killVMProcess(ctx, pid); killErr != nil && s.logger != nil {
|
|
s.logger.Warn("kill vm process during delete failed", append(vmLogAttrs(vm), "pid", pid, "error", killErr.Error())...)
|
|
}
|
|
}
|
|
op.stage("cleanup_runtime")
|
|
if err := s.cleanupRuntime(ctx, vm, false); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
clearRuntimeTeardownState(&vm)
|
|
op.stage("delete_store_record")
|
|
if err := s.store.DeleteVM(ctx, vm.ID); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
if vm.Runtime.VMDir != "" {
|
|
op.stage("delete_vm_dir", "vm_dir", vm.Runtime.VMDir)
|
|
if err := os.RemoveAll(vm.Runtime.VMDir); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
}
|
|
// Drop any host-key pins. A future VM reusing this IP or name
|
|
// would otherwise trip the TOFU mismatch branch in
|
|
// TOFUHostKeyCallback and fail to connect.
|
|
removeVMKnownHosts(s.layout.KnownHostsPath, vm, s.logger)
|
|
return vm, nil
|
|
}
|