daemon: surface previously-swallowed errors at warn

Three recovery-path errors were silently dropped:

- vm_lifecycle.go startVMLocked persisted the VMStateError record
  with `_ = s.store.UpsertVM(...)`. If the persist failed, the user
  still saw the original start error, but operators had no way to
  find out that the store had also drifted out of sync.
- vm_lifecycle.go deleteVMLocked killed the Firecracker process
  with `_ = s.net.killVMProcess(...)`. cleanupRuntime tears it
  down regardless, so the explicit kill is best-effort, but a
  permission-denied (EPERM) failure was still worth logging.
- capabilities.go cleanupPreparedCapabilities collected per-capability
  errors with errors.Join. Callers got the aggregated value but
  couldn't tell which capability failed when more than one did
  (see the illustration after this list).

All three now log at Warn before the original behaviour continues.
The aggregate return value, control flow, and user-visible error
strings are unchanged — this is purely a "less silence in the
journal" pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Thales Maciel 2026-04-26 22:30:51 -03:00
parent 71a332a6a1
commit fa4292756d
2 changed files with 28 additions and 3 deletions

capabilities.go

@@ -144,7 +144,16 @@ func (d *Daemon) cleanupPreparedCapabilities(ctx context.Context, vm *model.VMRe
 		if !ok {
 			continue
 		}
-		err = joinErr(err, hook.Cleanup(ctx, *vm))
+		cleanupErr := hook.Cleanup(ctx, *vm)
+		if cleanupErr != nil && d.logger != nil {
+			// Log per-capability cleanup failures. The aggregate
+			// errors.Join return value is still the contract for
+			// callers, but a multi-failure cleanup hides which
+			// capability misbehaved unless we surface each one
+			// individually here.
+			d.logger.Warn("capability cleanup failed", append(vmLogAttrs(*vm), "capability", capabilities[index].Name(), "error", cleanupErr.Error())...)
+		}
+		err = joinErr(err, cleanupErr)
 	}
 	return err
 }
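
The hunk above calls two helpers the diff doesn't show. A plausible
shape, assumed from how they are used here rather than taken from the
repo:

    // Assumed shapes only; the real definitions are not in this diff.
    // joinErr reads like a thin wrapper over the standard library's
    // errors.Join, and vmLogAttrs returns slog-style key/value pairs
    // that each Warn call appends its own fields to.
    func joinErr(errs ...error) error {
        return errors.Join(errs...)
    }

    func vmLogAttrs(vm model.VMRecord) []any {
        return []any{"vm_id", vm.ID}
    }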

vm_lifecycle.go

@@ -82,7 +82,16 @@ func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image
 		clearRuntimeTeardownState(&vm)
 		s.clearVMHandles(vm)
 		if s.store != nil {
-			_ = s.store.UpsertVM(context.Background(), vm)
+			// We're in the recovery path: the start has already
+			// failed, and the user will see runErr. A persist
+			// failure here only affects what 'banger vm show'
+			// reads on the next call, so we keep returning runErr
+			// — but a silent swallow leaves operators unable to
+			// debug "why does the record still say running?". Log
+			// at warn instead.
+			if persistErr := s.store.UpsertVM(context.Background(), vm); persistErr != nil && s.logger != nil {
+				s.logger.Warn("persist vm error state failed", append(vmLogAttrs(vm), "error", persistErr.Error())...)
+			}
 		}
 		return model.VMRecord{}, runErr
 	}
@@ -255,7 +264,14 @@ func (s *VMService) deleteVMLocked(ctx context.Context, current model.VMRecord)
 	if s.vmAlive(vm) {
 		pid := s.vmHandles(vm.ID).PID
 		op.stage("kill_running_vm", "pid", pid)
-		_ = s.net.killVMProcess(ctx, pid)
+		// Best-effort: cleanupRuntime below tears the process down
+		// regardless. A kill failure here only matters when it
+		// surfaces something operators should see (permission
+		// denied, etc.), so promote it from a silent _ to a Warn
+		// without changing the control flow.
+		if killErr := s.net.killVMProcess(ctx, pid); killErr != nil && s.logger != nil {
+			s.logger.Warn("kill vm process during delete failed", append(vmLogAttrs(vm), "pid", pid, "error", killErr.Error())...)
+		}
 	}
 	op.stage("cleanup_runtime")
 	if err := s.cleanupRuntime(ctx, vm, false); err != nil {