VM start, image build, and network/setup failures were hard to diagnose because bangerd emitted almost no lifecycle logs and the Firecracker SDK logger was discarded. This adds a daemon-wide JSON logger with configurable log level so failures leave breadcrumbs instead of only side effects. Log the main daemon and VM lifecycle stages, preserve raw Firecracker and image-build helper output in dedicated files, and include those log paths in daemon status and returned errors. Bridge SDK logrus output into the daemon logger at debug level so low-level Firecracker diagnostics are available without making normal info logs unreadable. Validation: go test ./... and make build. Left unrelated worktree changes out of this commit, including internal/api/types.go, the deleted shell scripts, and my-rootfs.ext4.
475 lines
13 KiB
Go
475 lines
13 KiB
Go
package daemon
|
|
|
|
import (
|
|
"bufio"
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"log/slog"
|
|
"net"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"banger/internal/api"
|
|
"banger/internal/config"
|
|
"banger/internal/model"
|
|
"banger/internal/paths"
|
|
"banger/internal/rpc"
|
|
"banger/internal/store"
|
|
"banger/internal/system"
|
|
)
|
|
|
|
type Daemon struct {
|
|
layout paths.Layout
|
|
config model.DaemonConfig
|
|
store *store.Store
|
|
runner system.CommandRunner
|
|
logger *slog.Logger
|
|
mu sync.Mutex
|
|
closing chan struct{}
|
|
once sync.Once
|
|
pid int
|
|
listener net.Listener
|
|
}
|
|
|
|
func Open(ctx context.Context) (*Daemon, error) {
|
|
layout, err := paths.Resolve()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if err := paths.Ensure(layout); err != nil {
|
|
return nil, err
|
|
}
|
|
cfg, err := config.Load(layout)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
logger, normalizedLevel, err := newDaemonLogger(os.Stderr, cfg.LogLevel)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
cfg.LogLevel = normalizedLevel
|
|
db, err := store.Open(layout.DBPath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
d := &Daemon{
|
|
layout: layout,
|
|
config: cfg,
|
|
store: db,
|
|
runner: system.NewRunner(),
|
|
logger: logger,
|
|
closing: make(chan struct{}),
|
|
pid: os.Getpid(),
|
|
}
|
|
d.logger.Info("daemon opened", "socket", layout.SocketPath, "state_dir", layout.StateDir, "runtime_dir", cfg.RuntimeDir, "log_level", cfg.LogLevel)
|
|
if err := d.ensureDefaultImage(ctx); err != nil {
|
|
d.logger.Error("daemon open failed", "stage", "ensure_default_image", "error", err.Error())
|
|
return nil, err
|
|
}
|
|
if err := d.reconcile(ctx); err != nil {
|
|
d.logger.Error("daemon open failed", "stage", "reconcile", "error", err.Error())
|
|
return nil, err
|
|
}
|
|
return d, nil
|
|
}
|
|
|
|
func (d *Daemon) Close() error {
|
|
var err error
|
|
d.once.Do(func() {
|
|
if d.logger != nil {
|
|
d.logger.Info("daemon closing")
|
|
}
|
|
close(d.closing)
|
|
if d.listener != nil {
|
|
_ = d.listener.Close()
|
|
}
|
|
err = d.store.Close()
|
|
})
|
|
return err
|
|
}
|
|
|
|
func (d *Daemon) Serve(ctx context.Context) error {
|
|
_ = os.Remove(d.layout.SocketPath)
|
|
listener, err := net.Listen("unix", d.layout.SocketPath)
|
|
if err != nil {
|
|
if d.logger != nil {
|
|
d.logger.Error("daemon listen failed", "socket", d.layout.SocketPath, "error", err.Error())
|
|
}
|
|
return err
|
|
}
|
|
d.listener = listener
|
|
defer listener.Close()
|
|
defer os.Remove(d.layout.SocketPath)
|
|
if err := os.Chmod(d.layout.SocketPath, 0o600); err != nil {
|
|
return err
|
|
}
|
|
if d.logger != nil {
|
|
d.logger.Info("daemon serving", "socket", d.layout.SocketPath, "pid", d.pid)
|
|
}
|
|
|
|
go d.backgroundLoop()
|
|
|
|
for {
|
|
conn, err := listener.Accept()
|
|
if err != nil {
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil
|
|
case <-d.closing:
|
|
return nil
|
|
default:
|
|
}
|
|
if ne, ok := err.(net.Error); ok && ne.Temporary() {
|
|
if d.logger != nil {
|
|
d.logger.Warn("daemon accept temporary failure", "error", err.Error())
|
|
}
|
|
time.Sleep(100 * time.Millisecond)
|
|
continue
|
|
}
|
|
if d.logger != nil {
|
|
d.logger.Error("daemon accept failed", "error", err.Error())
|
|
}
|
|
return err
|
|
}
|
|
go d.handleConn(conn)
|
|
}
|
|
}
|
|
|
|
func (d *Daemon) handleConn(conn net.Conn) {
|
|
defer conn.Close()
|
|
var req rpc.Request
|
|
if err := json.NewDecoder(bufio.NewReader(conn)).Decode(&req); err != nil {
|
|
if d.logger != nil {
|
|
d.logger.Warn("daemon request decode failed", "remote", conn.RemoteAddr().String(), "error", err.Error())
|
|
}
|
|
_ = json.NewEncoder(conn).Encode(rpc.NewError("bad_request", err.Error()))
|
|
return
|
|
}
|
|
resp := d.dispatch(req)
|
|
_ = json.NewEncoder(conn).Encode(resp)
|
|
}
|
|
|
|
func (d *Daemon) dispatch(req rpc.Request) rpc.Response {
|
|
if req.Version != rpc.Version {
|
|
return rpc.NewError("bad_version", fmt.Sprintf("unsupported version %d", req.Version))
|
|
}
|
|
ctx := context.Background()
|
|
switch req.Method {
|
|
case "ping":
|
|
result, _ := rpc.NewResult(api.PingResult{Status: "ok", PID: d.pid})
|
|
return result
|
|
case "shutdown":
|
|
go d.Close()
|
|
result, _ := rpc.NewResult(api.ShutdownResult{Status: "stopping"})
|
|
return result
|
|
case "vm.create":
|
|
params, err := rpc.DecodeParams[api.VMCreateParams](req)
|
|
if err != nil {
|
|
return rpc.NewError("bad_request", err.Error())
|
|
}
|
|
vm, err := d.CreateVM(ctx, params)
|
|
return marshalResultOrError(api.VMShowResult{VM: vm}, err)
|
|
case "vm.list":
|
|
vms, err := d.store.ListVMs(ctx)
|
|
return marshalResultOrError(api.VMListResult{VMs: vms}, err)
|
|
case "vm.show":
|
|
params, err := rpc.DecodeParams[api.VMRefParams](req)
|
|
if err != nil {
|
|
return rpc.NewError("bad_request", err.Error())
|
|
}
|
|
vm, err := d.FindVM(ctx, params.IDOrName)
|
|
return marshalResultOrError(api.VMShowResult{VM: vm}, err)
|
|
case "vm.start":
|
|
params, err := rpc.DecodeParams[api.VMRefParams](req)
|
|
if err != nil {
|
|
return rpc.NewError("bad_request", err.Error())
|
|
}
|
|
vm, err := d.StartVM(ctx, params.IDOrName)
|
|
return marshalResultOrError(api.VMShowResult{VM: vm}, err)
|
|
case "vm.stop":
|
|
params, err := rpc.DecodeParams[api.VMRefParams](req)
|
|
if err != nil {
|
|
return rpc.NewError("bad_request", err.Error())
|
|
}
|
|
vm, err := d.StopVM(ctx, params.IDOrName)
|
|
return marshalResultOrError(api.VMShowResult{VM: vm}, err)
|
|
case "vm.kill":
|
|
params, err := rpc.DecodeParams[api.VMKillParams](req)
|
|
if err != nil {
|
|
return rpc.NewError("bad_request", err.Error())
|
|
}
|
|
vm, err := d.KillVM(ctx, params)
|
|
return marshalResultOrError(api.VMShowResult{VM: vm}, err)
|
|
case "vm.restart":
|
|
params, err := rpc.DecodeParams[api.VMRefParams](req)
|
|
if err != nil {
|
|
return rpc.NewError("bad_request", err.Error())
|
|
}
|
|
vm, err := d.RestartVM(ctx, params.IDOrName)
|
|
return marshalResultOrError(api.VMShowResult{VM: vm}, err)
|
|
case "vm.delete":
|
|
params, err := rpc.DecodeParams[api.VMRefParams](req)
|
|
if err != nil {
|
|
return rpc.NewError("bad_request", err.Error())
|
|
}
|
|
vm, err := d.DeleteVM(ctx, params.IDOrName)
|
|
return marshalResultOrError(api.VMShowResult{VM: vm}, err)
|
|
case "vm.set":
|
|
params, err := rpc.DecodeParams[api.VMSetParams](req)
|
|
if err != nil {
|
|
return rpc.NewError("bad_request", err.Error())
|
|
}
|
|
vm, err := d.SetVM(ctx, params)
|
|
return marshalResultOrError(api.VMShowResult{VM: vm}, err)
|
|
case "vm.stats":
|
|
params, err := rpc.DecodeParams[api.VMRefParams](req)
|
|
if err != nil {
|
|
return rpc.NewError("bad_request", err.Error())
|
|
}
|
|
vm, stats, err := d.GetVMStats(ctx, params.IDOrName)
|
|
return marshalResultOrError(api.VMStatsResult{VM: vm, Stats: stats}, err)
|
|
case "vm.logs":
|
|
params, err := rpc.DecodeParams[api.VMRefParams](req)
|
|
if err != nil {
|
|
return rpc.NewError("bad_request", err.Error())
|
|
}
|
|
vm, err := d.FindVM(ctx, params.IDOrName)
|
|
if err != nil {
|
|
return rpc.NewError("not_found", err.Error())
|
|
}
|
|
return marshalResultOrError(api.VMLogsResult{LogPath: vm.Runtime.LogPath}, nil)
|
|
case "vm.ssh":
|
|
params, err := rpc.DecodeParams[api.VMRefParams](req)
|
|
if err != nil {
|
|
return rpc.NewError("bad_request", err.Error())
|
|
}
|
|
vm, err := d.TouchVM(ctx, params.IDOrName)
|
|
if err != nil {
|
|
return rpc.NewError("not_found", err.Error())
|
|
}
|
|
if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
|
return rpc.NewError("not_running", fmt.Sprintf("vm %s is not running", vm.Name))
|
|
}
|
|
return marshalResultOrError(api.VMSSHResult{Name: vm.Name, GuestIP: vm.Runtime.GuestIP}, nil)
|
|
case "image.list":
|
|
images, err := d.store.ListImages(ctx)
|
|
return marshalResultOrError(api.ImageListResult{Images: images}, err)
|
|
case "image.show":
|
|
params, err := rpc.DecodeParams[api.ImageRefParams](req)
|
|
if err != nil {
|
|
return rpc.NewError("bad_request", err.Error())
|
|
}
|
|
image, err := d.FindImage(ctx, params.IDOrName)
|
|
return marshalResultOrError(api.ImageShowResult{Image: image}, err)
|
|
case "image.build":
|
|
params, err := rpc.DecodeParams[api.ImageBuildParams](req)
|
|
if err != nil {
|
|
return rpc.NewError("bad_request", err.Error())
|
|
}
|
|
image, err := d.BuildImage(ctx, params)
|
|
return marshalResultOrError(api.ImageShowResult{Image: image}, err)
|
|
case "image.delete":
|
|
params, err := rpc.DecodeParams[api.ImageRefParams](req)
|
|
if err != nil {
|
|
return rpc.NewError("bad_request", err.Error())
|
|
}
|
|
image, err := d.DeleteImage(ctx, params.IDOrName)
|
|
return marshalResultOrError(api.ImageShowResult{Image: image}, err)
|
|
default:
|
|
return rpc.NewError("unknown_method", req.Method)
|
|
}
|
|
}
|
|
|
|
func (d *Daemon) backgroundLoop() {
|
|
statsTicker := time.NewTicker(d.config.StatsPollInterval)
|
|
staleTicker := time.NewTicker(model.DefaultStaleSweepInterval)
|
|
defer statsTicker.Stop()
|
|
defer staleTicker.Stop()
|
|
for {
|
|
select {
|
|
case <-d.closing:
|
|
return
|
|
case <-statsTicker.C:
|
|
if err := d.pollStats(context.Background()); err != nil && d.logger != nil {
|
|
d.logger.Error("background stats poll failed", "error", err.Error())
|
|
}
|
|
case <-staleTicker.C:
|
|
if err := d.stopStaleVMs(context.Background()); err != nil && d.logger != nil {
|
|
d.logger.Error("background stale sweep failed", "error", err.Error())
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (d *Daemon) ensureDefaultImage(ctx context.Context) error {
|
|
if d.config.DefaultImageName == "" {
|
|
return nil
|
|
}
|
|
if _, err := d.store.GetImageByName(ctx, d.config.DefaultImageName); err == nil {
|
|
if d.logger != nil {
|
|
d.logger.Debug("default image already registered", "image_name", d.config.DefaultImageName)
|
|
}
|
|
return nil
|
|
}
|
|
rootfs := d.config.DefaultRootfs
|
|
kernel := d.config.DefaultKernel
|
|
initrd := d.config.DefaultInitrd
|
|
if !exists(rootfs) || !exists(kernel) {
|
|
if d.logger != nil {
|
|
d.logger.Debug("default image skipped", "image_name", d.config.DefaultImageName, "rootfs_path", rootfs, "kernel_path", kernel)
|
|
}
|
|
return nil
|
|
}
|
|
id, err := model.NewID()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
now := model.Now()
|
|
image := model.Image{
|
|
ID: id,
|
|
Name: d.config.DefaultImageName,
|
|
Managed: false,
|
|
RootfsPath: rootfs,
|
|
KernelPath: kernel,
|
|
InitrdPath: initrd,
|
|
ModulesDir: d.config.DefaultModulesDir,
|
|
PackagesPath: d.config.DefaultPackagesFile,
|
|
Docker: strings.Contains(filepath.Base(rootfs), "docker"),
|
|
CreatedAt: now,
|
|
UpdatedAt: now,
|
|
}
|
|
if err := d.store.UpsertImage(ctx, image); err != nil {
|
|
return err
|
|
}
|
|
if d.logger != nil {
|
|
d.logger.Info("default image registered", append(imageLogAttrs(image), "managed", image.Managed)...)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (d *Daemon) reconcile(ctx context.Context) error {
|
|
op := d.beginOperation("daemon.reconcile")
|
|
vms, err := d.store.ListVMs(ctx)
|
|
if err != nil {
|
|
return op.fail(err)
|
|
}
|
|
for _, vm := range vms {
|
|
if vm.State != model.VMStateRunning {
|
|
continue
|
|
}
|
|
if system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
|
continue
|
|
}
|
|
op.stage("stale_vm", vmLogAttrs(vm)...)
|
|
_ = d.cleanupRuntime(ctx, vm, true)
|
|
vm.State = model.VMStateStopped
|
|
vm.Runtime.State = model.VMStateStopped
|
|
vm.Runtime.PID = 0
|
|
vm.Runtime.TapDevice = ""
|
|
vm.Runtime.APISockPath = ""
|
|
vm.Runtime.BaseLoop = ""
|
|
vm.Runtime.COWLoop = ""
|
|
vm.Runtime.DMName = ""
|
|
vm.Runtime.DMDev = ""
|
|
vm.UpdatedAt = model.Now()
|
|
if err := d.store.UpsertVM(ctx, vm); err != nil {
|
|
return op.fail(err, vmLogAttrs(vm)...)
|
|
}
|
|
}
|
|
op.done()
|
|
return nil
|
|
}
|
|
|
|
func (d *Daemon) FindVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
|
|
if idOrName == "" {
|
|
return model.VMRecord{}, errors.New("vm id or name is required")
|
|
}
|
|
if vm, err := d.store.GetVM(ctx, idOrName); err == nil {
|
|
return vm, nil
|
|
}
|
|
vms, err := d.store.ListVMs(ctx)
|
|
if err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
matchCount := 0
|
|
var match model.VMRecord
|
|
for _, vm := range vms {
|
|
if strings.HasPrefix(vm.ID, idOrName) || strings.HasPrefix(vm.Name, idOrName) {
|
|
match = vm
|
|
matchCount++
|
|
}
|
|
}
|
|
if matchCount == 1 {
|
|
return match, nil
|
|
}
|
|
if matchCount > 1 {
|
|
return model.VMRecord{}, fmt.Errorf("multiple VMs match %q", idOrName)
|
|
}
|
|
return model.VMRecord{}, fmt.Errorf("vm %q not found", idOrName)
|
|
}
|
|
|
|
func (d *Daemon) FindImage(ctx context.Context, idOrName string) (model.Image, error) {
|
|
if idOrName == "" {
|
|
return model.Image{}, errors.New("image id or name is required")
|
|
}
|
|
if image, err := d.store.GetImageByName(ctx, idOrName); err == nil {
|
|
return image, nil
|
|
}
|
|
if image, err := d.store.GetImageByID(ctx, idOrName); err == nil {
|
|
return image, nil
|
|
}
|
|
images, err := d.store.ListImages(ctx)
|
|
if err != nil {
|
|
return model.Image{}, err
|
|
}
|
|
matchCount := 0
|
|
var match model.Image
|
|
for _, image := range images {
|
|
if strings.HasPrefix(image.ID, idOrName) || strings.HasPrefix(image.Name, idOrName) {
|
|
match = image
|
|
matchCount++
|
|
}
|
|
}
|
|
if matchCount == 1 {
|
|
return match, nil
|
|
}
|
|
if matchCount > 1 {
|
|
return model.Image{}, fmt.Errorf("multiple images match %q", idOrName)
|
|
}
|
|
return model.Image{}, fmt.Errorf("image %q not found", idOrName)
|
|
}
|
|
|
|
func (d *Daemon) TouchVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
|
|
d.mu.Lock()
|
|
defer d.mu.Unlock()
|
|
vm, err := d.FindVM(ctx, idOrName)
|
|
if err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
system.TouchNow(&vm)
|
|
if err := d.store.UpsertVM(ctx, vm); err != nil {
|
|
return model.VMRecord{}, err
|
|
}
|
|
return vm, nil
|
|
}
|
|
|
|
func marshalResultOrError(v any, err error) rpc.Response {
|
|
if err != nil {
|
|
return rpc.NewError("operation_failed", err.Error())
|
|
}
|
|
resp, marshalErr := rpc.NewResult(v)
|
|
if marshalErr != nil {
|
|
return rpc.NewError("marshal_failed", marshalErr.Error())
|
|
}
|
|
return resp
|
|
}
|
|
|
|
func exists(path string) bool {
|
|
_, err := os.Stat(path)
|
|
return err == nil
|
|
}
|