Add structured daemon lifecycle logs

VM start, image build, and network/setup failures were hard to diagnose because bangerd emitted almost no lifecycle logs and the Firecracker SDK logger was discarded. This adds a daemon-wide JSON logger with configurable log level so failures leave breadcrumbs instead of only side effects.
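
For reference, a minimal sketch of that kind of setup with log/slog; the constructor name and level strings here are illustrative, not the exact bangerd code:

package daemon

import (
	"io"
	"log/slog"
	"strings"
)

// newDaemonLogger builds a JSON-emitting slog.Logger whose minimum level
// comes from daemon configuration, so debug logging can be switched on
// without a rebuild. (Hypothetical sketch; not the committed code.)
func newDaemonLogger(w io.Writer, level string) *slog.Logger {
	var lvl slog.Level
	switch strings.ToLower(level) {
	case "debug":
		lvl = slog.LevelDebug
	case "warn":
		lvl = slog.LevelWarn
	case "error":
		lvl = slog.LevelError
	default:
		lvl = slog.LevelInfo
	}
	return slog.New(slog.NewJSONHandler(w, &slog.HandlerOptions{Level: lvl}))
}

The variadic key/value call sites in the diff below (d.logger.Info("...", vmLogAttrs(vm)...)) are consistent with slog's API.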

Log the main daemon and VM lifecycle stages, preserve raw Firecracker and image-build helper output in dedicated files, and include those log paths in daemon status and returned errors. Bridge SDK logrus output into the daemon logger at debug level so low-level Firecracker diagnostics are available without making normal info logs unreadable.
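
Sketched under the assumption that the daemon logger is slog-compatible, the bridge can be a logrus hook that forwards every SDK entry at debug level; the types and names below are illustrative, not the committed implementation:

package daemon

import (
	"io"
	"log/slog"

	"github.com/sirupsen/logrus"
)

// slogBridge forwards every logrus entry emitted by the Firecracker SDK
// to the daemon logger at debug level, carrying the SDK's own level and
// fields along as attributes. (Hypothetical sketch.)
type slogBridge struct{ logger *slog.Logger }

func (b slogBridge) Levels() []logrus.Level { return logrus.AllLevels }

func (b slogBridge) Fire(e *logrus.Entry) error {
	attrs := make([]any, 0, 2*len(e.Data)+2)
	attrs = append(attrs, "sdk_level", e.Level.String())
	for k, v := range e.Data {
		attrs = append(attrs, k, v)
	}
	b.logger.Debug(e.Message, attrs...)
	return nil
}

// newSDKLogger wires the hook onto a dedicated logrus.Logger whose normal
// output is discarded, so SDK noise never reaches info-level output.
func newSDKLogger(daemon *slog.Logger) *logrus.Logger {
	l := logrus.New()
	l.SetOutput(io.Discard)
	l.SetLevel(logrus.DebugLevel)
	l.AddHook(slogBridge{logger: daemon})
	return l
}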

Validation: go test ./... and make build. Left unrelated worktree changes out of this commit, including internal/api/types.go, the deleted shell scripts, and my-rootfs.ext4.
Thales Maciel 2026-03-16 16:16:28 -03:00
parent 5018bc6170
commit 644e60d739
13 changed files with 746 additions and 31 deletions

@@ -17,9 +17,17 @@ import (
"banger/internal/system"
)
func (d *Daemon) CreateVM(ctx context.Context, params api.VMCreateParams) (model.VMRecord, error) {
func (d *Daemon) CreateVM(ctx context.Context, params api.VMCreateParams) (vm model.VMRecord, err error) {
d.mu.Lock()
defer d.mu.Unlock()
op := d.beginOperation("vm.create")
defer func() {
if err != nil {
op.fail(err)
return
}
op.done(vmLogAttrs(vm)...)
}()
imageName := params.ImageName
if imageName == "" {
@@ -29,6 +37,7 @@ func (d *Daemon) CreateVM(ctx context.Context, params api.VMCreateParams) (model
if err != nil {
return model.VMRecord{}, err
}
op.stage("image_resolved", imageLogAttrs(image)...)
name := strings.TrimSpace(params.Name)
if name == "" {
name, err = d.generateName(ctx)
@@ -73,7 +82,7 @@ func (d *Daemon) CreateVM(ctx context.Context, params api.VMCreateParams) (model
WorkDiskSizeBytes: workDiskSize,
NATEnabled: params.NATEnabled,
}
vm := model.VMRecord{
vm = model.VMRecord{
ID: id,
Name: name,
ImageID: image.ID,
@@ -96,6 +105,7 @@ func (d *Daemon) CreateVM(ctx context.Context, params api.VMCreateParams) (model
if err := d.store.UpsertVM(ctx, vm); err != nil {
return model.VMRecord{}, err
}
op.stage("persisted", vmLogAttrs(vm)...)
if params.NoStart {
vm.State = model.VMStateStopped
vm.Runtime.State = model.VMStateStopped
@@ -119,25 +129,41 @@ func (d *Daemon) StartVM(ctx context.Context, idOrName string) (model.VMRecord,
return model.VMRecord{}, err
}
if vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
if d.logger != nil {
d.logger.Info("vm already running", vmLogAttrs(vm)...)
}
return vm, nil
}
return d.startVMLocked(ctx, vm, image)
}
func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image model.Image) (model.VMRecord, error) {
func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image model.Image) (_ model.VMRecord, err error) {
op := d.beginOperation("vm.start", append(vmLogAttrs(vm), imageLogAttrs(image)...)...)
defer func() {
if err != nil {
err = annotateLogPath(err, vm.Runtime.LogPath)
op.fail(err, vmLogAttrs(vm)...)
return
}
op.done(vmLogAttrs(vm)...)
}()
op.stage("preflight")
if err := d.validateStartPrereqs(ctx, vm, image); err != nil {
return model.VMRecord{}, err
}
if err := os.MkdirAll(vm.Runtime.VMDir, 0o755); err != nil {
return model.VMRecord{}, err
}
op.stage("cleanup_runtime")
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
return model.VMRecord{}, err
}
clearRuntimeHandles(&vm)
op.stage("bridge")
if err := d.ensureBridge(ctx); err != nil {
return model.VMRecord{}, err
}
op.stage("socket_dir")
if err := d.ensureSocketDir(); err != nil {
return model.VMRecord{}, err
}
@@ -150,10 +176,12 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
return model.VMRecord{}, err
}
op.stage("system_overlay", "overlay_path", vm.Runtime.SystemOverlay)
if err := d.ensureSystemOverlay(ctx, &vm); err != nil {
return model.VMRecord{}, err
}
op.stage("dm_snapshot", "dm_name", dmName)
handles, err := d.createDMSnapshot(ctx, image.RootfsPath, vm.Runtime.SystemOverlay, dmName)
if err != nil {
return model.VMRecord{}, err
@@ -172,6 +200,7 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
vm.State = model.VMStateError
vm.Runtime.State = model.VMStateError
vm.Runtime.LastError = err.Error()
op.stage("cleanup_after_failure", "error", err.Error())
if cleanupErr := d.cleanupRuntime(context.Background(), vm, true); cleanupErr != nil {
err = errors.Join(err, cleanupErr)
}
@@ -180,23 +209,29 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
return model.VMRecord{}, err
}
op.stage("patch_root_overlay")
if err := d.patchRootOverlay(ctx, vm, image); err != nil {
return cleanupOnErr(err)
}
op.stage("work_disk", "work_disk_path", vm.Runtime.WorkDiskPath)
if err := d.ensureWorkDisk(ctx, &vm); err != nil {
return cleanupOnErr(err)
}
op.stage("tap", "tap_device", tap)
if err := d.createTap(ctx, tap); err != nil {
return cleanupOnErr(err)
}
op.stage("metrics_file", "metrics_path", vm.Runtime.MetricsPath)
if err := os.WriteFile(vm.Runtime.MetricsPath, nil, 0o644); err != nil {
return cleanupOnErr(err)
}
op.stage("firecracker_binary")
fcPath, err := d.firecrackerBinary()
if err != nil {
return cleanupOnErr(err)
}
op.stage("firecracker_launch", "log_path", vm.Runtime.LogPath, "metrics_path", vm.Runtime.MetricsPath)
machine, err := firecracker.NewMachine(ctx, firecracker.MachineConfig{
BinaryPath: fcPath,
VMID: vm.ID,
@@ -211,6 +246,7 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
TapDevice: tap,
VCPUCount: vm.Spec.VCPUCount,
MemoryMiB: vm.Spec.MemoryMiB,
Logger: d.logger,
})
if err != nil {
return cleanupOnErr(err)
@@ -220,32 +256,46 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
return cleanupOnErr(err)
}
vm.Runtime.PID = d.resolveFirecrackerPID(ctx, machine, apiSock)
op.debugStage("firecracker_started", "pid", vm.Runtime.PID)
op.stage("socket_access", "api_socket", apiSock)
if err := d.ensureSocketAccess(ctx, apiSock); err != nil {
return cleanupOnErr(err)
}
op.stage("dns", "dns_name", vm.Runtime.DNSName)
if err := d.setDNS(ctx, vm.Name, vm.Runtime.GuestIP); err != nil {
return cleanupOnErr(err)
}
if vm.Spec.NATEnabled {
op.stage("nat")
if err := d.ensureNAT(ctx, vm, true); err != nil {
return cleanupOnErr(err)
}
}
system.TouchNow(&vm)
op.stage("persist")
if err := d.store.UpsertVM(ctx, vm); err != nil {
return cleanupOnErr(err)
}
return vm, nil
}
func (d *Daemon) StopVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
func (d *Daemon) StopVM(ctx context.Context, idOrName string) (vm model.VMRecord, err error) {
d.mu.Lock()
defer d.mu.Unlock()
vm, err := d.FindVM(ctx, idOrName)
op := d.beginOperation("vm.stop", "vm_ref", idOrName)
defer func() {
if err != nil {
op.fail(err, vmLogAttrs(vm)...)
return
}
op.done(vmLogAttrs(vm)...)
}()
vm, err = d.FindVM(ctx, idOrName)
if err != nil {
return model.VMRecord{}, err
}
if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
op.stage("cleanup_stale_runtime")
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
return model.VMRecord{}, err
}
@@ -257,12 +307,15 @@ func (d *Daemon) StopVM(ctx context.Context, idOrName string) (model.VMRecord, e
}
return vm, nil
}
op.stage("graceful_shutdown")
if err := d.sendCtrlAltDel(ctx, vm); err != nil {
return model.VMRecord{}, err
}
op.stage("wait_for_exit", "pid", vm.Runtime.PID)
if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
return model.VMRecord{}, err
}
op.stage("cleanup_runtime")
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
return model.VMRecord{}, err
}
@@ -276,15 +329,24 @@ func (d *Daemon) StopVM(ctx context.Context, idOrName string) (model.VMRecord, e
return vm, nil
}
func (d *Daemon) KillVM(ctx context.Context, params api.VMKillParams) (model.VMRecord, error) {
func (d *Daemon) KillVM(ctx context.Context, params api.VMKillParams) (vm model.VMRecord, err error) {
d.mu.Lock()
defer d.mu.Unlock()
op := d.beginOperation("vm.kill", "vm_ref", params.IDOrName, "signal", params.Signal)
defer func() {
if err != nil {
op.fail(err, vmLogAttrs(vm)...)
return
}
op.done(vmLogAttrs(vm)...)
}()
vm, err := d.FindVM(ctx, params.IDOrName)
vm, err = d.FindVM(ctx, params.IDOrName)
if err != nil {
return model.VMRecord{}, err
}
if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
op.stage("cleanup_stale_runtime")
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
return model.VMRecord{}, err
}
@@ -301,12 +363,15 @@ func (d *Daemon) KillVM(ctx context.Context, params api.VMKillParams) (model.VMR
if signal == "" {
signal = "TERM"
}
op.stage("send_signal", "pid", vm.Runtime.PID, "signal", signal)
if _, err := d.runner.RunSudo(ctx, "kill", "-"+signal, strconv.Itoa(vm.Runtime.PID)); err != nil {
return model.VMRecord{}, err
}
op.stage("wait_for_exit", "pid", vm.Runtime.PID)
if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
return model.VMRecord{}, err
}
op.stage("cleanup_runtime")
if err := d.cleanupRuntime(ctx, vm, true); err != nil {
return model.VMRecord{}, err
}
@@ -320,35 +385,59 @@ func (d *Daemon) KillVM(ctx context.Context, params api.VMKillParams) (model.VMR
return vm, nil
}
func (d *Daemon) RestartVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
vm, err := d.StopVM(ctx, idOrName)
func (d *Daemon) RestartVM(ctx context.Context, idOrName string) (vm model.VMRecord, err error) {
op := d.beginOperation("vm.restart", "vm_ref", idOrName)
defer func() {
if err != nil {
op.fail(err, vmLogAttrs(vm)...)
return
}
op.done(vmLogAttrs(vm)...)
}()
op.stage("stop")
vm, err = d.StopVM(ctx, idOrName)
if err != nil {
return model.VMRecord{}, err
}
op.stage("start", vmLogAttrs(vm)...)
return d.StartVM(ctx, vm.ID)
}
func (d *Daemon) DeleteVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
func (d *Daemon) DeleteVM(ctx context.Context, idOrName string) (vm model.VMRecord, err error) {
d.mu.Lock()
defer d.mu.Unlock()
vm, err := d.FindVM(ctx, idOrName)
op := d.beginOperation("vm.delete", "vm_ref", idOrName)
defer func() {
if err != nil {
op.fail(err, vmLogAttrs(vm)...)
return
}
op.done(vmLogAttrs(vm)...)
}()
vm, err = d.FindVM(ctx, idOrName)
if err != nil {
return model.VMRecord{}, err
}
if vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
op.stage("kill_running_vm", "pid", vm.Runtime.PID)
_ = d.killVMProcess(ctx, vm.Runtime.PID)
}
op.stage("cleanup_runtime")
if err := d.cleanupRuntime(ctx, vm, false); err != nil {
return model.VMRecord{}, err
}
if vm.Spec.NATEnabled {
op.debugStage("disable_nat")
_ = d.ensureNAT(ctx, vm, false)
}
op.debugStage("remove_dns", "dns_name", vm.Runtime.DNSName)
_ = d.removeDNS(ctx, vm.Runtime.DNSName)
op.stage("delete_store_record")
if err := d.store.DeleteVM(ctx, vm.ID); err != nil {
return model.VMRecord{}, err
}
if vm.Runtime.VMDir != "" {
op.stage("delete_vm_dir", "vm_dir", vm.Runtime.VMDir)
if err := os.RemoveAll(vm.Runtime.VMDir); err != nil {
return model.VMRecord{}, err
}
@@ -356,10 +445,18 @@ func (d *Daemon) DeleteVM(ctx context.Context, idOrName string) (model.VMRecord,
return vm, nil
}
func (d *Daemon) SetVM(ctx context.Context, params api.VMSetParams) (model.VMRecord, error) {
func (d *Daemon) SetVM(ctx context.Context, params api.VMSetParams) (vm model.VMRecord, err error) {
d.mu.Lock()
defer d.mu.Unlock()
vm, err := d.FindVM(ctx, params.IDOrName)
op := d.beginOperation("vm.set", "vm_ref", params.IDOrName)
defer func() {
if err != nil {
op.fail(err, vmLogAttrs(vm)...)
return
}
op.done(vmLogAttrs(vm)...)
}()
vm, err = d.FindVM(ctx, params.IDOrName)
if err != nil {
return model.VMRecord{}, err
}
@@ -368,12 +465,14 @@ func (d *Daemon) SetVM(ctx context.Context, params api.VMSetParams) (model.VMRec
if running {
return model.VMRecord{}, errors.New("vcpu changes require the VM to be stopped")
}
op.stage("update_vcpu", "vcpu_count", *params.VCPUCount)
vm.Spec.VCPUCount = *params.VCPUCount
}
if params.MemoryMiB != nil {
if running {
return model.VMRecord{}, errors.New("memory changes require the VM to be stopped")
}
op.stage("update_memory", "memory_mib", *params.MemoryMiB)
vm.Spec.MemoryMiB = *params.MemoryMiB
}
if params.WorkDiskSize != "" {
@@ -389,6 +488,7 @@ func (d *Daemon) SetVM(ctx context.Context, params api.VMSetParams) (model.VMRec
}
if size > vm.Spec.WorkDiskSizeBytes {
if exists(vm.Runtime.WorkDiskPath) {
op.stage("resize_work_disk", "from_bytes", vm.Spec.WorkDiskSizeBytes, "to_bytes", size)
if err := d.validateWorkDiskResizePrereqs(); err != nil {
return model.VMRecord{}, err
}
@@ -400,6 +500,7 @@ func (d *Daemon) SetVM(ctx context.Context, params api.VMSetParams) (model.VMRec
}
}
if params.NATEnabled != nil {
op.stage("update_nat", "nat_enabled", *params.NATEnabled)
vm.Spec.NATEnabled = *params.NATEnabled
if running {
if err := d.ensureNAT(ctx, vm, *params.NATEnabled); err != nil {
@@ -426,6 +527,9 @@ func (d *Daemon) GetVMStats(ctx context.Context, idOrName string) (model.VMRecor
vm.Stats = stats
vm.UpdatedAt = model.Now()
_ = d.store.UpsertVM(ctx, vm)
if d.logger != nil {
d.logger.Debug("vm stats collected", append(vmLogAttrs(vm), "rss_bytes", stats.RSSBytes, "vsz_bytes", stats.VSZBytes, "cpu_percent", stats.CPUPercent)...)
}
}
return vm, vm.Stats, nil
}
@@ -443,6 +547,9 @@ func (d *Daemon) pollStats(ctx context.Context) error {
}
stats, err := d.collectStats(ctx, vm)
if err != nil {
if d.logger != nil {
d.logger.Debug("vm stats collection failed", append(vmLogAttrs(vm), "error", err.Error())...)
}
continue
}
vm.Stats = stats
@@ -452,10 +559,18 @@ func (d *Daemon) pollStats(ctx context.Context) error {
return nil
}
func (d *Daemon) stopStaleVMs(ctx context.Context) error {
func (d *Daemon) stopStaleVMs(ctx context.Context) (err error) {
if d.config.AutoStopStaleAfter <= 0 {
return nil
}
op := d.beginOperation("vm.stop_stale")
defer func() {
if err != nil {
op.fail(err)
return
}
op.done()
}()
d.mu.Lock()
defer d.mu.Unlock()
vms, err := d.store.ListVMs(ctx)
@@ -470,6 +585,7 @@ func (d *Daemon) stopStaleVMs(ctx context.Context) error {
if now.Sub(vm.LastTouchedAt) < d.config.AutoStopStaleAfter {
continue
}
op.stage("stopping_vm", vmLogAttrs(vm)...)
_ = d.sendCtrlAltDel(ctx, vm)
_ = d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 10*time.Second)
_ = d.cleanupRuntime(ctx, vm, true)
@@ -635,7 +751,7 @@ func (d *Daemon) sendCtrlAltDel(ctx context.Context, vm model.VMRecord) error {
if err := d.ensureSocketAccess(ctx, vm.Runtime.APISockPath); err != nil {
return err
}
client := firecracker.New(vm.Runtime.APISockPath)
client := firecracker.New(vm.Runtime.APISockPath, d.logger)
return client.SendCtrlAltDel(ctx)
}
@@ -657,6 +773,9 @@ func (d *Daemon) waitForExit(ctx context.Context, pid int, apiSock string, timeo
}
func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserveDisks bool) error {
if d.logger != nil {
d.logger.Debug("cleanup runtime", append(vmLogAttrs(vm), "preserve_disks", preserveDisks)...)
}
if vm.Runtime.PID > 0 && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
_ = d.killVMProcess(ctx, vm.Runtime.PID)
}
@@ -699,6 +818,9 @@ func (d *Daemon) setDNS(ctx context.Context, vmName, guestIP string) error {
}
}
_, err := d.runner.Run(ctx, d.mapdnsBinary(), d.mapdnsArgs("set", vmName+".vm", guestIP)...)
if err == nil && d.logger != nil {
d.logger.Debug("dns record set", "dns_name", vmName+".vm", "guest_ip", guestIP)
}
return err
}
@@ -710,6 +832,9 @@ func (d *Daemon) removeDNS(ctx context.Context, dnsName string) error {
if err != nil && strings.Contains(err.Error(), "not found") {
return nil
}
if err == nil && d.logger != nil {
d.logger.Debug("dns record removed", "dns_name", dnsName)
}
return err
}