daemon: thread per-RPC op_id end-to-end
Today there's no way to correlate a CLI failure with a daemon log line. operationLog records relative timing but no id, two concurrent vm.start calls log indistinguishably, and the async vmCreateOperationState.ID is user-facing yet never reaches the journal. The root helper logs plain text to stderr while bangerd logs JSON, so a merged journalctl is hard to grep across the trust-boundary split. Mint a per-RPC op id at dispatch entry, store it on context, and include it as an "op_id" attr on every operationLog record. The id is stamped onto every error response (including the early short-circuit paths bad_version and unknown_method). rpc.Call forwards the context op id on requests so a daemon RPC and the helper RPCs it triggers all share one id. The helper now logs JSON to match bangerd, adopts the inbound id, and emits a single "helper rpc completed" / "helper rpc failed" line per call so operators can see at a glance how long each privileged op took. vmCreateOperationState.ID is now the same id dispatch generated for vm.create.begin — one identifier between client status polls, daemon logs, and helper logs. The wire format gains two optional fields: rpc.Request.OpID and rpc.ErrorResponse.OpID, both omitempty so older peers (and the opposite direction) ignore them. ErrorResponse.Error() now appends "(op-XXXXXX)" to its string form when set; existing callers that just print err.Error() get the id for free. Tests cover: dispatch stamps op_id on unknown_method, bad_version, and handler-returned errors; rpc.Call exposes the typed *ErrorResponse via errors.As so the CLI can read code/op_id; ctx op_id is forwarded to the server in the request envelope. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
b8c48765fb
commit
e47b8146dc
16 changed files with 333 additions and 44 deletions
|
|
@ -9,6 +9,7 @@ import (
|
|||
"time"
|
||||
|
||||
"banger/internal/model"
|
||||
"banger/internal/rpc"
|
||||
)
|
||||
|
||||
func newDaemonLogger(w io.Writer, rawLevel string) (*slog.Logger, string, error) {
|
||||
|
|
@ -35,9 +36,37 @@ func parseLogLevel(raw string) (slog.Level, string, error) {
|
|||
}
|
||||
}
|
||||
|
||||
func (d *Daemon) beginOperation(name string, attrs ...any) *operationLog {
|
||||
// WithOpID stores the per-RPC correlation id on ctx. Re-exported
|
||||
// from rpc so daemon-side call sites don't have to import rpc just
|
||||
// for context plumbing. The dispatch layer calls this on every
|
||||
// incoming request; capability hooks, lifecycle steps, and the
|
||||
// privileged-ops shim that crosses into the root helper all read
|
||||
// the id back via OpIDFromContext so a single id stitches the
|
||||
// whole chain together in journalctl.
|
||||
func WithOpID(ctx context.Context, opID string) context.Context {
|
||||
return rpc.WithOpID(ctx, opID)
|
||||
}
|
||||
|
||||
// OpIDFromContext returns the dispatch-assigned op id stored on
|
||||
// ctx, or "" if none was set.
|
||||
func OpIDFromContext(ctx context.Context) string {
|
||||
return rpc.OpIDFromContext(ctx)
|
||||
}
|
||||
|
||||
// beginOperation starts a logged operation. When ctx carries a
|
||||
// dispatch-assigned op id (see WithOpID) every log line emitted
|
||||
// through the returned operationLog includes it as an "op_id" attr,
|
||||
// so the daemon journal is greppable by id from the user's CLI
|
||||
// error all the way down through capability hooks and the root
|
||||
// helper.
|
||||
func (d *Daemon) beginOperation(ctx context.Context, name string, attrs ...any) *operationLog {
|
||||
opID := OpIDFromContext(ctx)
|
||||
allAttrs := append([]any(nil), attrs...)
|
||||
if opID != "" {
|
||||
allAttrs = append([]any{"op_id", opID}, allAttrs...)
|
||||
}
|
||||
if d.logger != nil {
|
||||
d.logger.Info("operation started", append([]any{"operation", name}, attrs...)...)
|
||||
d.logger.Info("operation started", append([]any{"operation", name}, allAttrs...)...)
|
||||
}
|
||||
now := time.Now()
|
||||
return &operationLog{
|
||||
|
|
@ -45,7 +74,8 @@ func (d *Daemon) beginOperation(name string, attrs ...any) *operationLog {
|
|||
name: name,
|
||||
started: now,
|
||||
last: now,
|
||||
attrs: append([]any(nil), attrs...),
|
||||
attrs: allAttrs,
|
||||
opID: opID,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -55,6 +85,16 @@ type operationLog struct {
|
|||
started time.Time
|
||||
last time.Time
|
||||
attrs []any
|
||||
opID string
|
||||
}
|
||||
|
||||
// OpID exposes the correlation id this operation was started with so
|
||||
// dispatch can stamp it onto an outgoing error response.
|
||||
func (o *operationLog) OpID() string {
|
||||
if o == nil {
|
||||
return ""
|
||||
}
|
||||
return o.opID
|
||||
}
|
||||
|
||||
func (o *operationLog) stage(stage string, attrs ...any) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue