Today there's no way to correlate a CLI failure with a daemon log line: operationLog records relative timing but no id, two concurrent vm.start calls log indistinguishably, and the async vmCreateOperationState.ID is user-facing yet never reaches the journal. The root helper logs plain text to stderr while bangerd logs JSON, so a merged journalctl is hard to grep across the trust-boundary split.

Mint a per-RPC op id at dispatch entry, store it on the context, and include it as an "op_id" attr on every operationLog record. The id is stamped onto every error response, including the early short-circuit paths bad_version and unknown_method. rpc.Call forwards the context op id on requests, so a daemon RPC and the helper RPCs it triggers all share one id. The helper now logs JSON to match bangerd, adopts the inbound id, and emits a single "helper rpc completed" / "helper rpc failed" line per call, so operators can see at a glance how long each privileged op took. vmCreateOperationState.ID is now the same id dispatch generated for vm.create.begin: one identifier across client status polls, daemon logs, and helper logs.

The wire format gains two optional fields, rpc.Request.OpID and rpc.ErrorResponse.OpID, both omitempty so older peers (and the opposite direction) ignore them. ErrorResponse.Error() now appends "(op-XXXXXX)" to its string form when set; existing callers that just print err.Error() get the id for free.

Tests cover: dispatch stamps op_id on unknown_method, bad_version, and handler-returned errors; rpc.Call exposes the typed *ErrorResponse via errors.As so the CLI can read code/op_id; ctx op_id is forwarded to the server in the request envelope.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
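A minimal sketch of the flow described above, assuming a JSON request/response envelope. Only rpc.Request.OpID, rpc.ErrorResponse.OpID, the "op_id" attr, and the "(op-XXXXXX)" error suffix are taken from the message; the remaining field names and the newOpID / WithOpID / OpIDFrom / dispatch helpers are hypothetical stand-ins showing how minting, context propagation, and error stamping could fit together, not the repository's actual API.

// Hedged sketch only: rpc.Request.OpID, rpc.ErrorResponse.OpID, and the
// "(op-XXXXXX)" suffix come from the commit message above; everything
// else here (field names, newOpID, WithOpID, OpIDFrom, dispatch) is a
// hypothetical illustration of the flow, not the project's actual API.
package rpc

import (
	"context"
	"crypto/rand"
	"encoding/hex"
	"encoding/json"
)

// newOpID mints the per-RPC identifier at dispatch entry. Three random
// bytes hex-encoded give the six characters behind "op-XXXXXX".
func newOpID() string {
	var b [3]byte
	_, _ = rand.Read(b[:])
	return "op-" + hex.EncodeToString(b[:])
}

type opIDKey struct{}

// WithOpID stores the id on the context so operationLog records and
// onward rpc.Call invocations (daemon -> root helper) can reuse it.
func WithOpID(ctx context.Context, id string) context.Context {
	return context.WithValue(ctx, opIDKey{}, id)
}

// OpIDFrom returns the id minted at dispatch entry, or "" if none.
func OpIDFrom(ctx context.Context) string {
	id, _ := ctx.Value(opIDKey{}).(string)
	return id
}

// Request and ErrorResponse carry the id as optional fields; omitempty
// keeps the envelope compatible with peers that predate the change.
type Request struct {
	Version string          `json:"version"`
	Method  string          `json:"method"`
	Params  json.RawMessage `json:"params,omitempty"`
	OpID    string          `json:"op_id,omitempty"`
}

type ErrorResponse struct {
	Code    string `json:"code"`
	Message string `json:"message"`
	OpID    string `json:"op_id,omitempty"`
}

// Error appends "(op-XXXXXX)" when the id is set, so callers that only
// print err.Error() still surface the correlation handle.
func (e *ErrorResponse) Error() string {
	if e.OpID != "" {
		return e.Message + " (" + e.OpID + ")"
	}
	return e.Message
}

// dispatch sketches the entry point: adopt the inbound id or mint one,
// stash it on ctx, and stamp it on every error, including the early
// bad_version / unknown_method short-circuits.
func dispatch(ctx context.Context, req Request, handlers map[string]func(context.Context, json.RawMessage) (any, error)) (any, *ErrorResponse) {
	id := req.OpID
	if id == "" {
		id = newOpID()
	}
	ctx = WithOpID(ctx, id)
	if req.Version != "1" {
		return nil, &ErrorResponse{Code: "bad_version", Message: "unsupported protocol version", OpID: id}
	}
	h, ok := handlers[req.Method]
	if !ok {
		return nil, &ErrorResponse{Code: "unknown_method", Message: "unknown method " + req.Method, OpID: id}
	}
	res, err := h(ctx, req.Params)
	if err != nil {
		return nil, &ErrorResponse{Code: "internal", Message: err.Error(), OpID: id}
	}
	return res, nil
}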
280 lines
8.2 KiB
Go
package daemon

import (
	"context"
	"errors"
	"os"
	"path/filepath"
	"strings"
	"time"

	"banger/internal/api"
	"banger/internal/model"
	"banger/internal/system"
)

func (s *VMService) StartVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		image, err := s.store.GetImageByID(ctx, vm.ImageID)
		if err != nil {
			return model.VMRecord{}, err
		}
		if s.vmAlive(vm) {
			if s.logger != nil {
				s.logger.Info("vm already running", vmLogAttrs(vm)...)
			}
			return vm, nil
		}
		return s.startVMLocked(ctx, vm, image)
	})
}

func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image model.Image) (_ model.VMRecord, err error) {
	op := s.beginOperation(ctx, "vm.start", append(vmLogAttrs(vm), imageLogAttrs(image)...)...)
	defer func() {
		if err != nil {
			err = annotateLogPath(err, vm.Runtime.LogPath)
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()

	// Derive per-VM paths/names up front so every step sees the same
	// values. Shortening vm.ID mirrors how the pre-refactor inline
	// code did it.
	shortID := system.ShortID(vm.ID)
	apiSock := filepath.Join(s.layout.RuntimeDir, "fc-"+shortID+".sock")
	dmName := "fc-rootfs-" + shortID
	tapName := "tap-fc-" + shortID
	if strings.TrimSpace(vm.Runtime.VSockPath) == "" {
		vm.Runtime.VSockPath = defaultVSockPath(s.layout.RuntimeDir, vm.ID)
	}
	if vm.Runtime.VSockCID == 0 {
		vm.Runtime.VSockCID, err = defaultVSockCID(vm.Runtime.GuestIP)
		if err != nil {
			return model.VMRecord{}, err
		}
	}

	live := model.VMHandles{}
	sc := &startContext{
		vm:      &vm,
		image:   image,
		live:    &live,
		apiSock: apiSock,
		dmName:  dmName,
		tapName: tapName,
	}

	if runErr := s.runStartSteps(ctx, op, sc, s.buildStartSteps(op, sc)); runErr != nil {
		// The step driver already ran rollback in reverse for every
		// succeeded step. All that's left is to persist the ERROR
		// state so operators see the failure via `vm show`. Use a
		// fresh context in case the request ctx is cancelled — DB
		// writes past this point are recovery, not user-driven.
		//
		// The store check is for tests that construct a bare Daemon
		// without a DB; production always has s.store non-nil.
		vm.State = model.VMStateError
		vm.Runtime.State = model.VMStateError
		vm.Runtime.LastError = runErr.Error()
		clearRuntimeTeardownState(&vm)
		s.clearVMHandles(vm)
		if s.store != nil {
			_ = s.store.UpsertVM(context.Background(), vm)
		}
		return model.VMRecord{}, runErr
	}
	return vm, nil
}

func (s *VMService) StopVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		return s.stopVMLocked(ctx, vm)
	})
}

func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
	vm = current
	op := s.beginOperation(ctx, "vm.stop", "vm_ref", vm.ID)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	if !s.vmAlive(vm) {
		op.stage("cleanup_stale_runtime")
		if err := s.cleanupRuntime(ctx, vm, true); err != nil {
			return model.VMRecord{}, err
		}
		vm.State = model.VMStateStopped
		vm.Runtime.State = model.VMStateStopped
		clearRuntimeTeardownState(&vm)
		s.clearVMHandles(vm)
		if err := s.store.UpsertVM(ctx, vm); err != nil {
			return model.VMRecord{}, err
		}
		return vm, nil
	}
	pid := s.vmHandles(vm.ID).PID
	op.stage("graceful_shutdown")
	if err := s.net.sendCtrlAltDel(ctx, vm.Runtime.APISockPath); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("wait_for_exit", "pid", pid)
	if err := s.net.waitForExit(ctx, pid, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil {
		if !errors.Is(err, errWaitForExitTimeout) {
			return model.VMRecord{}, err
		}
		op.stage("graceful_shutdown_timeout", "pid", pid)
	}
	op.stage("cleanup_runtime")
	if err := s.cleanupRuntime(ctx, vm, true); err != nil {
		return model.VMRecord{}, err
	}
	vm.State = model.VMStateStopped
	vm.Runtime.State = model.VMStateStopped
	clearRuntimeTeardownState(&vm)
	s.clearVMHandles(vm)
	system.TouchNow(&vm)
	if err := s.store.UpsertVM(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	return vm, nil
}

func (s *VMService) KillVM(ctx context.Context, params api.VMKillParams) (model.VMRecord, error) {
	return s.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		return s.killVMLocked(ctx, vm, params.Signal)
	})
}

func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, signalValue string) (vm model.VMRecord, err error) {
	vm = current
	op := s.beginOperation(ctx, "vm.kill", "vm_ref", vm.ID, "signal", signalValue)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	if !s.vmAlive(vm) {
		op.stage("cleanup_stale_runtime")
		if err := s.cleanupRuntime(ctx, vm, true); err != nil {
			return model.VMRecord{}, err
		}
		vm.State = model.VMStateStopped
		vm.Runtime.State = model.VMStateStopped
		clearRuntimeTeardownState(&vm)
		s.clearVMHandles(vm)
		if err := s.store.UpsertVM(ctx, vm); err != nil {
			return model.VMRecord{}, err
		}
		return vm, nil
	}

	signal := strings.TrimSpace(signalValue)
	if signal == "" {
		signal = "TERM"
	}
	pid := s.vmHandles(vm.ID).PID
	op.stage("send_signal", "pid", pid, "signal", signal)
	if err := s.privOps().SignalProcess(ctx, pid, signal); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("wait_for_exit", "pid", pid)
	if err := s.net.waitForExit(ctx, pid, vm.Runtime.APISockPath, 30*time.Second); err != nil {
		if !errors.Is(err, errWaitForExitTimeout) {
			return model.VMRecord{}, err
		}
		op.stage("signal_timeout", "pid", pid, "signal", signal)
	}
	op.stage("cleanup_runtime")
	if err := s.cleanupRuntime(ctx, vm, true); err != nil {
		return model.VMRecord{}, err
	}
	vm.State = model.VMStateStopped
	vm.Runtime.State = model.VMStateStopped
	clearRuntimeTeardownState(&vm)
	s.clearVMHandles(vm)
	system.TouchNow(&vm)
	if err := s.store.UpsertVM(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	return vm, nil
}

func (s *VMService) RestartVM(ctx context.Context, idOrName string) (vm model.VMRecord, err error) {
	op := s.beginOperation(ctx, "vm.restart", "vm_ref", idOrName)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	resolved, err := s.FindVM(ctx, idOrName)
	if err != nil {
		return model.VMRecord{}, err
	}
	return s.withVMLockByID(ctx, resolved.ID, func(vm model.VMRecord) (model.VMRecord, error) {
		op.stage("stop")
		vm, err = s.stopVMLocked(ctx, vm)
		if err != nil {
			return model.VMRecord{}, err
		}
		image, err := s.store.GetImageByID(ctx, vm.ImageID)
		if err != nil {
			return model.VMRecord{}, err
		}
		op.stage("start", vmLogAttrs(vm)...)
		return s.startVMLocked(ctx, vm, image)
	})
}

func (s *VMService) DeleteVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		return s.deleteVMLocked(ctx, vm)
	})
}

func (s *VMService) deleteVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
	vm = current
	op := s.beginOperation(ctx, "vm.delete", "vm_ref", vm.ID)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()
	if s.vmAlive(vm) {
		pid := s.vmHandles(vm.ID).PID
		op.stage("kill_running_vm", "pid", pid)
		_ = s.net.killVMProcess(ctx, pid)
	}
	op.stage("cleanup_runtime")
	if err := s.cleanupRuntime(ctx, vm, false); err != nil {
		return model.VMRecord{}, err
	}
	clearRuntimeTeardownState(&vm)
	op.stage("delete_store_record")
	if err := s.store.DeleteVM(ctx, vm.ID); err != nil {
		return model.VMRecord{}, err
	}
	if vm.Runtime.VMDir != "" {
		op.stage("delete_vm_dir", "vm_dir", vm.Runtime.VMDir)
		if err := os.RemoveAll(vm.Runtime.VMDir); err != nil {
			return model.VMRecord{}, err
		}
	}
	// Drop any host-key pins. A future VM reusing this IP or name
	// would otherwise trip the TOFU mismatch branch in
	// TOFUHostKeyCallback and fail to connect.
	removeVMKnownHosts(s.layout.KnownHostsPath, vm, s.logger)
	return vm, nil
}