daemon: thread per-RPC op_id end-to-end

Today there's no way to correlate a CLI failure with a daemon log line. operationLog records relative timing but no id, so two concurrent vm.start calls log indistinguishably, and the async vmCreateOperationState.ID is user-facing yet never reaches the journal. The root helper logs plain text to stderr while bangerd logs JSON, so merged journalctl output is hard to grep across the trust-boundary split. Mint a per-RPC op id at dispatch entry, store it on the context, and include it as an "op_id" attr on every operationLog record. The id is stamped onto every error response (including the early short-circuit paths bad_version and unknown_method). rpc.Call forwards the context op id on requests so a daemon RPC and the helper RPCs it triggers all share one id. The helper now logs JSON to match bangerd, adopts the inbound id, and emits a single "helper rpc completed" / "helper rpc failed" line per call so operators can see at a glance how long each privileged op took. vmCreateOperationState.ID is now the same id dispatch generated for vm.create.begin — one identifier shared across client status polls, daemon logs, and helper logs. The wire format gains two optional fields: rpc.Request.OpID and rpc.ErrorResponse.OpID, both omitempty, so older peers ignore them when receiving and newer peers tolerate their absence when older peers send. ErrorResponse.Error() now appends "(op-XXXXXX)" to its string form when set; existing callers that just print err.Error() get the id for free. Tests cover: dispatch stamps op_id on unknown_method, bad_version, and handler-returned errors; rpc.Call exposes the typed *ErrorResponse via errors.As so the CLI can read code/op_id; ctx op_id is forwarded to the server in the request envelope. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 22:13:44 -03:00 · 2026-04-26 22:13:44 -03:00 · e47b8146dc
commit e47b8146dc
parent b8c48765fb
16 changed files with 333 additions and 44 deletions
--- a/internal/daemon/daemon.go
+++ b/internal/daemon/daemon.go
@ -310,17 +310,34 @@ func (d *Daemon) watchRequestDisconnect(conn net.Conn, reader *bufio.Reader, met
 }

 func (d *Daemon) dispatch(ctx context.Context, req rpc.Request) rpc.Response {
+	// Per-RPC correlation id is generated unconditionally — even
+	// errors that short-circuit before reaching a handler get one
+	// so the operator has a handle for every CLI failure.
+	// Generation can fail in theory (crypto/rand IO error) —
+	// degrade gracefully to a blank id rather than tearing down
+	// the request.
+	opID, _ := model.NewOpID()
+	if opID != "" {
+		ctx = WithOpID(ctx, opID)
+	}
+	stampOpID := func(resp rpc.Response) rpc.Response {
+		if !resp.OK && resp.Error != nil && resp.Error.OpID == "" && opID != "" {
+			resp.Error.OpID = opID
+		}
+		return resp
+	}
+
 	if req.Version != rpc.Version {
-		return rpc.NewError("bad_version", fmt.Sprintf("unsupported version %d", req.Version))
+		return stampOpID(rpc.NewError("bad_version", fmt.Sprintf("unsupported version %d", req.Version)))
 	}
 	if d.requestHandler != nil {
-		return d.requestHandler(ctx, req)
+		return stampOpID(d.requestHandler(ctx, req))
 	}
 	h, ok := rpcHandlers[req.Method]
 	if !ok {
-		return rpc.NewError("unknown_method", req.Method)
+		return stampOpID(rpc.NewError("unknown_method", req.Method))
 	}
-	return h(ctx, d, req)
+	return stampOpID(h(ctx, d, req))
 }

 func (d *Daemon) backgroundLoop() {
@ -346,7 +363,7 @@ func (d *Daemon) backgroundLoop() {
 }

 func (d *Daemon) reconcile(ctx context.Context) error {
-	op := d.beginOperation("daemon.reconcile")
+	op := d.beginOperation(ctx, "daemon.reconcile")
 	vms, err := d.store.ListVMs(ctx)
 	if err != nil {
 		return op.fail(err)
@ -441,14 +458,12 @@ func wireServices(d *Daemon) {
 	}
 	if d.img == nil {
 		d.img = newImageService(imageServiceDeps{
-			runner: d.runner,
-			logger: d.logger,
-			config: d.config,
-			layout: d.layout,
-			store:  d.store,
-			beginOperation: func(name string, attrs ...any) *operationLog {
-				return d.beginOperation(name, attrs...)
-			},
+			runner:         d.runner,
+			logger:         d.logger,
+			config:         d.config,
+			layout:         d.layout,
+			store:          d.store,
+			beginOperation: d.beginOperation,
 		})
 	}
 	if d.ws == nil {