From fa4292756dc2ba7e0871918eac6d905c7b61a9d0 Mon Sep 17 00:00:00 2001
From: Thales Maciel
Date: Sun, 26 Apr 2026 22:30:51 -0300
Subject: [PATCH] daemon: surface previously-swallowed errors at warn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three recovery-path errors were silently dropped:

- vm_lifecycle.go startVMLocked persisted the VMStateError record with
  `_ = s.store.UpsertVM(...)`. If the persist failed, the user saw the
  original start error, but operators had no way to tell that the store
  had also drifted out of sync.

- vm_lifecycle.go deleteVMLocked killed the firecracker process with
  `_ = s.net.killVMProcess(...)`. cleanupRuntime tears it down
  regardless, so the explicit kill is best-effort, but a
  permission-denied / EPERM failure was still worth logging.

- capabilities.go cleanupPreparedCapabilities collected per-capability
  errors with errors.Join. Callers got the aggregated value but
  couldn't tell which capability failed when more than one did.

All three now log at Warn before the original behaviour continues. The
aggregate return value, control flow, and user-visible error strings
are unchanged — this is purely a "less silence in the journal" pass.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 internal/daemon/capabilities.go | 11 ++++++++++-
 internal/daemon/vm_lifecycle.go | 20 ++++++++++++++++++--
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/internal/daemon/capabilities.go b/internal/daemon/capabilities.go
index b730591..89fa5e9 100644
--- a/internal/daemon/capabilities.go
+++ b/internal/daemon/capabilities.go
@@ -144,7 +144,16 @@ func (d *Daemon) cleanupPreparedCapabilities(ctx context.Context, vm *model.VMRe
 		if !ok {
 			continue
 		}
-		err = joinErr(err, hook.Cleanup(ctx, *vm))
+		cleanupErr := hook.Cleanup(ctx, *vm)
+		if cleanupErr != nil && d.logger != nil {
+			// Log per-capability cleanup failures. The aggregate
+			// errors.Join return value is still the contract for
+			// callers, but a multi-failure cleanup hides which
+			// capability misbehaved unless we surface each one
+			// individually here.
+			d.logger.Warn("capability cleanup failed", append(vmLogAttrs(*vm), "capability", capabilities[index].Name(), "error", cleanupErr.Error())...)
+		}
+		err = joinErr(err, cleanupErr)
 	}
 	return err
 }
diff --git a/internal/daemon/vm_lifecycle.go b/internal/daemon/vm_lifecycle.go
index 17e83e8..6f4fcf5 100644
--- a/internal/daemon/vm_lifecycle.go
+++ b/internal/daemon/vm_lifecycle.go
@@ -82,7 +82,16 @@ func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image
 		clearRuntimeTeardownState(&vm)
 		s.clearVMHandles(vm)
 		if s.store != nil {
-			_ = s.store.UpsertVM(context.Background(), vm)
+			// We're in the recovery path: the start has already
+			// failed, and the user will see runErr. A persist
+			// failure here only affects what 'banger vm show'
+			// reads on the next call, so we keep returning runErr
+			// — but a silent swallow leaves operators unable to
+			// debug "why does the record still say running?". Log
+			// at warn instead.
+			if persistErr := s.store.UpsertVM(context.Background(), vm); persistErr != nil && s.logger != nil {
+				s.logger.Warn("persist vm error state failed", append(vmLogAttrs(vm), "error", persistErr.Error())...)
+			}
 		}
 		return model.VMRecord{}, runErr
 	}
@@ -255,7 +264,14 @@ func (s *VMService) deleteVMLocked(ctx context.Context, current model.VMRecord)
 	if s.vmAlive(vm) {
 		pid := s.vmHandles(vm.ID).PID
 		op.stage("kill_running_vm", "pid", pid)
-		_ = s.net.killVMProcess(ctx, pid)
+		// Best-effort: cleanupRuntime below tears the process down
+		// regardless. A kill failure here only matters when it
+		// surfaces something operators should see (permission
+		// denied, etc.), so promote it from a silent _ to a Warn
+		// without changing the control flow.
+		if killErr := s.net.killVMProcess(ctx, pid); killErr != nil && s.logger != nil {
+			s.logger.Warn("kill vm process during delete failed", append(vmLogAttrs(vm), "pid", pid, "error", killErr.Error())...)
+		}
 	}
 	op.stage("cleanup_runtime")
 	if err := s.cleanupRuntime(ctx, vm, false); err != nil {