banger/internal/daemon/logger.go
Thales Maciel e47b8146dc
daemon: thread per-RPC op_id end-to-end
Today there's no way to correlate a CLI failure with a daemon log
line. operationLog records relative timing but no id, two concurrent
vm.start calls log indistinguishably, and the async
vmCreateOperationState.ID is user-facing yet never reaches the
journal. The root helper logs plain text to stderr while bangerd
logs JSON, so a merged journalctl is hard to grep across the
trust-boundary split.

Mint a per-RPC op id at dispatch entry, store it on context, and
include it as an "op_id" attr on every operationLog record. The
id is stamped onto every error response (including the early
short-circuit paths bad_version and unknown_method). rpc.Call
forwards the context op id on requests so a daemon RPC and the
helper RPCs it triggers all share one id. The helper now logs
JSON to match bangerd, adopts the inbound id, and emits a single
"helper rpc completed" / "helper rpc failed" line per call so
operators can see at a glance how long each privileged op took.

vmCreateOperationState.ID is now the same id dispatch generated
for vm.create.begin — one identifier between client status polls,
daemon logs, and helper logs.

The wire format gains two optional fields: rpc.Request.OpID and
rpc.ErrorResponse.OpID, both omitempty so older peers (and the
opposite direction) ignore them. ErrorResponse.Error() now appends
"(op-XXXXXX)" to its string form when set; existing callers that
just print err.Error() get the id for free.

Tests cover: dispatch stamps op_id on unknown_method, bad_version,
and handler-returned errors; rpc.Call exposes the typed
*ErrorResponse via errors.As so the CLI can read code/op_id; ctx
op_id is forwarded to the server in the request envelope.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 22:13:44 -03:00

185 lines
5.2 KiB
Go

package daemon
import (
"context"
"fmt"
"io"
"log/slog"
"strings"
"time"
"banger/internal/model"
"banger/internal/rpc"
)
func newDaemonLogger(w io.Writer, rawLevel string) (*slog.Logger, string, error) {
level, normalized, err := parseLogLevel(rawLevel)
if err != nil {
return nil, "", err
}
logger := slog.New(slog.NewJSONHandler(w, &slog.HandlerOptions{Level: level}))
return logger, normalized, nil
}
func parseLogLevel(raw string) (slog.Level, string, error) {
switch strings.ToLower(strings.TrimSpace(raw)) {
case "", "info":
return slog.LevelInfo, "info", nil
case "debug":
return slog.LevelDebug, "debug", nil
case "warn", "warning":
return slog.LevelWarn, "warn", nil
case "error":
return slog.LevelError, "error", nil
default:
return 0, "", fmt.Errorf("unsupported log_level %q (use debug, info, warn, or error)", raw)
}
}
// WithOpID stores the per-RPC correlation id on ctx. Re-exported
// from rpc so daemon-side call sites don't have to import rpc just
// for context plumbing. The dispatch layer calls this on every
// incoming request; capability hooks, lifecycle steps, and the
// privileged-ops shim that crosses into the root helper all read
// the id back via OpIDFromContext so a single id stitches the
// whole chain together in journalctl.
func WithOpID(ctx context.Context, opID string) context.Context {
return rpc.WithOpID(ctx, opID)
}
// OpIDFromContext returns the dispatch-assigned op id stored on
// ctx, or "" if none was set.
func OpIDFromContext(ctx context.Context) string {
return rpc.OpIDFromContext(ctx)
}
// beginOperation starts a logged operation. When ctx carries a
// dispatch-assigned op id (see WithOpID) every log line emitted
// through the returned operationLog includes it as an "op_id" attr,
// so the daemon journal can be greppable by id from the user's CLI
// error all the way down through capability hooks and the root
// helper.
func (d *Daemon) beginOperation(ctx context.Context, name string, attrs ...any) *operationLog {
opID := OpIDFromContext(ctx)
allAttrs := append([]any(nil), attrs...)
if opID != "" {
allAttrs = append([]any{"op_id", opID}, allAttrs...)
}
if d.logger != nil {
d.logger.Info("operation started", append([]any{"operation", name}, allAttrs...)...)
}
now := time.Now()
return &operationLog{
logger: d.logger,
name: name,
started: now,
last: now,
attrs: allAttrs,
opID: opID,
}
}
type operationLog struct {
logger *slog.Logger
name string
started time.Time
last time.Time
attrs []any
opID string
}
// OpID exposes the correlation id this operation was started with so
// dispatch can stamp it onto an outgoing error response.
func (o *operationLog) OpID() string {
if o == nil {
return ""
}
return o.opID
}
func (o *operationLog) stage(stage string, attrs ...any) {
now := time.Now()
o.log(slog.LevelInfo, "operation stage", append([]any{
"stage", stage,
"since_start_ms", now.Sub(o.started).Milliseconds(),
"since_last_stage_ms", now.Sub(o.last).Milliseconds(),
}, attrs...)...)
o.last = now
}
func (o *operationLog) debugStage(stage string, attrs ...any) {
now := time.Now()
o.log(slog.LevelDebug, "operation stage", append([]any{
"stage", stage,
"since_start_ms", now.Sub(o.started).Milliseconds(),
"since_last_stage_ms", now.Sub(o.last).Milliseconds(),
}, attrs...)...)
o.last = now
}
func (o *operationLog) done(attrs ...any) {
o.log(slog.LevelInfo, "operation completed", append([]any{"duration_ms", time.Since(o.started).Milliseconds()}, attrs...)...)
}
func (o *operationLog) fail(err error, attrs ...any) error {
if err == nil {
return nil
}
o.log(slog.LevelError, "operation failed", append([]any{"duration_ms", time.Since(o.started).Milliseconds(), "error", err.Error()}, attrs...)...)
return err
}
func (o operationLog) log(level slog.Level, msg string, attrs ...any) {
if o.logger == nil {
return
}
base := append([]any{"operation", o.name}, o.attrs...)
base = append(base, attrs...)
o.logger.Log(context.Background(), level, msg, base...)
}
// vmLogAttrs returns the durable identifying fields for a VM that
// are always safe to log. Transient handles (PID, tap device) moved
// off VMRecord when the schema was split; lifecycle ops log those
// explicitly on the events where they matter (e.g. wait_for_exit).
func vmLogAttrs(vm model.VMRecord) []any {
attrs := []any{
"vm_id", vm.ID,
"vm_name", vm.Name,
"image_id", vm.ImageID,
}
if vm.Runtime.GuestIP != "" {
attrs = append(attrs, "guest_ip", vm.Runtime.GuestIP)
}
if vm.Runtime.APISockPath != "" {
attrs = append(attrs, "api_socket", vm.Runtime.APISockPath)
}
if vm.Runtime.LogPath != "" {
attrs = append(attrs, "log_path", vm.Runtime.LogPath)
}
return attrs
}
func imageLogAttrs(image model.Image) []any {
attrs := []any{
"image_id", image.ID,
"image_name", image.Name,
}
if image.ArtifactDir != "" {
attrs = append(attrs, "artifact_dir", image.ArtifactDir)
}
if image.RootfsPath != "" {
attrs = append(attrs, "rootfs_path", image.RootfsPath)
}
if image.WorkSeedPath != "" {
attrs = append(attrs, "work_seed_path", image.WorkSeedPath)
}
return attrs
}
func annotateLogPath(err error, path string) error {
if err == nil || strings.TrimSpace(path) == "" {
return err
}
return fmt.Errorf("%w; inspect %s", err, path)
}