Add structured daemon lifecycle logs
VM start, image build, and network/setup failures were hard to diagnose because bangerd emitted almost no lifecycle logs and the Firecracker SDK logger was discarded. This adds a daemon-wide JSON logger with a configurable log level so failures leave breadcrumbs instead of only side effects. Log the main daemon and VM lifecycle stages, preserve raw Firecracker and image-build helper output in dedicated files, and include those log paths in daemon status and in returned errors. Bridge SDK logrus output into the daemon logger at debug level so low-level Firecracker diagnostics are available without making normal info logs unreadable.

Validation: go test ./... and make build.

Unrelated worktree changes are left out of this commit, including internal/api/types.go, the deleted shell scripts, and my-rootfs.ext4.
parent 5018bc6170
commit 644e60d739
13 changed files with 746 additions and 31 deletions
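The hunks shown below do not include the logrus-to-slog bridge that the commit message describes. Purely as an illustration of that idea, a hook-based bridge from the Firecracker SDK's logrus logger into the daemon's slog logger could look roughly like this; the names slogBridgeHook and newSDKLogger are invented for the sketch and the package name is assumed:

package daemon // package name assumed for the sketch

import (
    "io"
    "log/slog"

    "github.com/sirupsen/logrus"
)

// slogBridgeHook forwards every logrus entry to a slog.Logger at debug level,
// keeping SDK chatter out of normal info-level output.
type slogBridgeHook struct {
    logger *slog.Logger
}

func (h slogBridgeHook) Levels() []logrus.Level { return logrus.AllLevels }

func (h slogBridgeHook) Fire(entry *logrus.Entry) error {
    // Flatten logrus fields into slog key/value attributes and record the
    // original logrus level so it is not lost.
    attrs := make([]any, 0, 2*len(entry.Data)+2)
    for k, v := range entry.Data {
        attrs = append(attrs, k, v)
    }
    attrs = append(attrs, "logrus_level", entry.Level.String())
    h.logger.Debug(entry.Message, attrs...)
    return nil
}

// newSDKLogger builds a logrus logger whose own output is discarded and whose
// entries are re-emitted through the daemon logger via the hook above.
func newSDKLogger(daemonLogger *slog.Logger) *logrus.Logger {
    l := logrus.New()
    l.SetOutput(io.Discard)
    l.SetLevel(logrus.DebugLevel)
    l.AddHook(slogBridgeHook{logger: daemonLogger})
    return l
}

A logger built this way can be handed to the Firecracker SDK wherever it accepts a logrus logger or entry, so the SDK diagnostics only reach the daemon's JSON stream when debug logging is enabled.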
@@ -6,6 +6,7 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"log/slog"
 	"net"
 	"os"
 	"path/filepath"
@@ -27,6 +28,7 @@ type Daemon struct {
 	config  model.DaemonConfig
 	store   *store.Store
 	runner  system.CommandRunner
+	logger  *slog.Logger
 	mu      sync.Mutex
 	closing chan struct{}
 	once    sync.Once
@@ -46,6 +48,11 @@ func Open(ctx context.Context) (*Daemon, error) {
 	if err != nil {
 		return nil, err
 	}
+	logger, normalizedLevel, err := newDaemonLogger(os.Stderr, cfg.LogLevel)
+	if err != nil {
+		return nil, err
+	}
+	cfg.LogLevel = normalizedLevel
 	db, err := store.Open(layout.DBPath)
 	if err != nil {
 		return nil, err
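newDaemonLogger is defined elsewhere in this commit; only its call site is visible above. Judging from that call site (writer plus configured level string in, logger plus normalized level plus error out), a plausible slog-based shape is sketched below. The accepted level names and the empty-string fallback are assumptions, not the commit's actual behavior:

package daemon // package name assumed for the sketch

import (
    "fmt"
    "io"
    "log/slog"
    "strings"
)

// newDaemonLogger parses a textual log level, defaults to "info" when the
// level is empty, and returns a JSON slog logger plus the normalized level.
func newDaemonLogger(w io.Writer, level string) (*slog.Logger, string, error) {
    normalized := strings.ToLower(strings.TrimSpace(level))
    if normalized == "" {
        normalized = "info"
    }
    var lvl slog.Level
    switch normalized {
    case "debug":
        lvl = slog.LevelDebug
    case "info":
        lvl = slog.LevelInfo
    case "warn", "warning":
        lvl, normalized = slog.LevelWarn, "warn"
    case "error":
        lvl = slog.LevelError
    default:
        return nil, "", fmt.Errorf("unknown log level %q", level)
    }
    handler := slog.NewJSONHandler(w, &slog.HandlerOptions{Level: lvl})
    return slog.New(handler), normalized, nil
}

With a JSON handler, the "daemon opened" record in the next hunk serializes as a single line along the lines of {"time":"…","level":"INFO","msg":"daemon opened","socket":"…","log_level":"info"}, which keeps the breadcrumbs easy to grep or feed to jq.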
@@ -55,13 +62,17 @@ func Open(ctx context.Context) (*Daemon, error) {
 		config:  cfg,
 		store:   db,
 		runner:  system.NewRunner(),
+		logger:  logger,
 		closing: make(chan struct{}),
 		pid:     os.Getpid(),
 	}
+	d.logger.Info("daemon opened", "socket", layout.SocketPath, "state_dir", layout.StateDir, "runtime_dir", cfg.RuntimeDir, "log_level", cfg.LogLevel)
 	if err := d.ensureDefaultImage(ctx); err != nil {
+		d.logger.Error("daemon open failed", "stage", "ensure_default_image", "error", err.Error())
 		return nil, err
 	}
 	if err := d.reconcile(ctx); err != nil {
+		d.logger.Error("daemon open failed", "stage", "reconcile", "error", err.Error())
 		return nil, err
 	}
 	return d, nil
@@ -70,6 +81,9 @@ func Open(ctx context.Context) (*Daemon, error) {
 func (d *Daemon) Close() error {
 	var err error
 	d.once.Do(func() {
+		if d.logger != nil {
+			d.logger.Info("daemon closing")
+		}
 		close(d.closing)
 		if d.listener != nil {
 			_ = d.listener.Close()
@@ -83,6 +97,9 @@ func (d *Daemon) Serve(ctx context.Context) error {
 	_ = os.Remove(d.layout.SocketPath)
 	listener, err := net.Listen("unix", d.layout.SocketPath)
 	if err != nil {
+		if d.logger != nil {
+			d.logger.Error("daemon listen failed", "socket", d.layout.SocketPath, "error", err.Error())
+		}
 		return err
 	}
 	d.listener = listener
@@ -91,6 +108,9 @@ func (d *Daemon) Serve(ctx context.Context) error {
 	if err := os.Chmod(d.layout.SocketPath, 0o600); err != nil {
 		return err
 	}
+	if d.logger != nil {
+		d.logger.Info("daemon serving", "socket", d.layout.SocketPath, "pid", d.pid)
+	}
 
 	go d.backgroundLoop()
 
@@ -105,9 +125,15 @@ func (d *Daemon) Serve(ctx context.Context) error {
 		default:
 		}
 		if ne, ok := err.(net.Error); ok && ne.Temporary() {
+			if d.logger != nil {
+				d.logger.Warn("daemon accept temporary failure", "error", err.Error())
+			}
 			time.Sleep(100 * time.Millisecond)
 			continue
 		}
+		if d.logger != nil {
+			d.logger.Error("daemon accept failed", "error", err.Error())
+		}
 		return err
 	}
 	go d.handleConn(conn)
@@ -118,6 +144,9 @@ func (d *Daemon) handleConn(conn net.Conn) {
 	defer conn.Close()
 	var req rpc.Request
 	if err := json.NewDecoder(bufio.NewReader(conn)).Decode(&req); err != nil {
+		if d.logger != nil {
+			d.logger.Warn("daemon request decode failed", "remote", conn.RemoteAddr().String(), "error", err.Error())
+		}
 		_ = json.NewEncoder(conn).Encode(rpc.NewError("bad_request", err.Error()))
 		return
 	}
@@ -169,6 +198,13 @@ func (d *Daemon) dispatch(req rpc.Request) rpc.Response {
 		}
 		vm, err := d.StopVM(ctx, params.IDOrName)
 		return marshalResultOrError(api.VMShowResult{VM: vm}, err)
+	case "vm.kill":
+		params, err := rpc.DecodeParams[api.VMKillParams](req)
+		if err != nil {
+			return rpc.NewError("bad_request", err.Error())
+		}
+		vm, err := d.KillVM(ctx, params)
+		return marshalResultOrError(api.VMShowResult{VM: vm}, err)
 	case "vm.restart":
 		params, err := rpc.DecodeParams[api.VMRefParams](req)
 		if err != nil {
@@ -259,9 +295,13 @@ func (d *Daemon) backgroundLoop() {
 		case <-d.closing:
 			return
 		case <-statsTicker.C:
-			_ = d.pollStats(context.Background())
+			if err := d.pollStats(context.Background()); err != nil && d.logger != nil {
+				d.logger.Error("background stats poll failed", "error", err.Error())
+			}
 		case <-staleTicker.C:
-			_ = d.stopStaleVMs(context.Background())
+			if err := d.stopStaleVMs(context.Background()); err != nil && d.logger != nil {
+				d.logger.Error("background stale sweep failed", "error", err.Error())
+			}
 		}
 	}
 }
@@ -271,12 +311,18 @@ func (d *Daemon) ensureDefaultImage(ctx context.Context) error {
 		return nil
 	}
 	if _, err := d.store.GetImageByName(ctx, d.config.DefaultImageName); err == nil {
+		if d.logger != nil {
+			d.logger.Debug("default image already registered", "image_name", d.config.DefaultImageName)
+		}
 		return nil
 	}
 	rootfs := d.config.DefaultRootfs
 	kernel := d.config.DefaultKernel
 	initrd := d.config.DefaultInitrd
 	if !exists(rootfs) || !exists(kernel) {
+		if d.logger != nil {
+			d.logger.Debug("default image skipped", "image_name", d.config.DefaultImageName, "rootfs_path", rootfs, "kernel_path", kernel)
+		}
 		return nil
 	}
 	id, err := model.NewID()
@@ -297,13 +343,20 @@ func (d *Daemon) ensureDefaultImage(ctx context.Context) error {
 		CreatedAt: now,
 		UpdatedAt: now,
 	}
-	return d.store.UpsertImage(ctx, image)
+	if err := d.store.UpsertImage(ctx, image); err != nil {
+		return err
+	}
+	if d.logger != nil {
+		d.logger.Info("default image registered", append(imageLogAttrs(image), "managed", image.Managed)...)
+	}
+	return nil
 }
 
 func (d *Daemon) reconcile(ctx context.Context) error {
+	op := d.beginOperation("daemon.reconcile")
 	vms, err := d.store.ListVMs(ctx)
 	if err != nil {
-		return err
+		return op.fail(err)
 	}
 	for _, vm := range vms {
 		if vm.State != model.VMStateRunning {
@@ -312,6 +365,7 @@ func (d *Daemon) reconcile(ctx context.Context) error {
 		if system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
 			continue
 		}
+		op.stage("stale_vm", vmLogAttrs(vm)...)
 		_ = d.cleanupRuntime(ctx, vm, true)
 		vm.State = model.VMStateStopped
 		vm.Runtime.State = model.VMStateStopped
@@ -324,9 +378,10 @@ func (d *Daemon) reconcile(ctx context.Context) error {
 		vm.Runtime.DMDev = ""
 		vm.UpdatedAt = model.Now()
 		if err := d.store.UpsertVM(ctx, vm); err != nil {
-			return err
+			return op.fail(err, vmLogAttrs(vm)...)
 		}
 	}
+	op.done()
 	return nil
 }
 
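The reconcile hunks above call an operation helper (beginOperation, stage, fail, done) whose definition lives in another file of this commit. A minimal sketch of the shape those call sites imply is below; it leans on the Daemon.logger field added in the diff, everything beyond the call-site names (including the operation struct and its fields) is assumed, and vmLogAttrs/imageLogAttrs are taken to return []any key/value pairs for slog:

package daemon // package name assumed for the sketch

import (
    "log/slog"
    "time"
)

// operation groups the start/stage/failure/completion logs of one named
// daemon operation and measures its duration.
type operation struct {
    name   string
    logger *slog.Logger
    start  time.Time
}

func (d *Daemon) beginOperation(name string) *operation {
    op := &operation{name: name, logger: d.logger, start: time.Now()}
    if op.logger != nil {
        op.logger.Info("operation started", "operation", name)
    }
    return op
}

func (op *operation) stage(stage string, attrs ...any) {
    if op.logger != nil {
        op.logger.Info("operation stage",
            append([]any{"operation", op.name, "stage", stage}, attrs...)...)
    }
}

// fail logs the error with any extra attributes and returns it unchanged so
// call sites can simply write `return op.fail(err)`.
func (op *operation) fail(err error, attrs ...any) error {
    if op.logger != nil {
        op.logger.Error("operation failed",
            append([]any{"operation", op.name, "error", err.Error(),
                "duration", time.Since(op.start).String()}, attrs...)...)
    }
    return err
}

func (op *operation) done() {
    if op.logger != nil {
        op.logger.Info("operation done",
            "operation", op.name, "duration", time.Since(op.start).String())
    }
}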