daemon: thread per-RPC op_id end-to-end

Today there's no way to correlate a CLI failure with a daemon log line. operationLog records relative timing but no id, so two concurrent vm.start calls log indistinguishably, and the async vmCreateOperationState.ID is user-facing yet never reaches the journal. The root helper logs plain text to stderr while bangerd logs JSON, so a merged journalctl is hard to grep across the trust-boundary split. Mint a per-RPC op id at dispatch entry, store it on context, and include it as an "op_id" attr on every operationLog record. The id is stamped onto every error response (including the early short-circuit paths bad_version and unknown_method). rpc.Call forwards the context op id on requests so a daemon RPC and the helper RPCs it triggers all share one id. The helper now logs JSON to match bangerd, adopts the inbound id, and emits a single "helper rpc completed" / "helper rpc failed" line per call so operators can see at a glance how long each privileged op took. vmCreateOperationState.ID is now the same id dispatch generated for vm.create.begin — one identifier between client status polls, daemon logs, and helper logs. The wire format gains two optional fields: rpc.Request.OpID and rpc.ErrorResponse.OpID, both omitempty, so older peers ignore them and newer peers tolerate their absence when talking to older ones. ErrorResponse.Error() now appends "(op-XXXXXX)" to its string form when set; existing callers that just print err.Error() get the id for free. Tests cover: dispatch stamps op_id on unknown_method, bad_version, and handler-returned errors; rpc.Call exposes the typed *ErrorResponse via errors.As so the CLI can read code/op_id; ctx op_id is forwarded to the server in the request envelope. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 22:13:44 -03:00 · 2026-04-26 22:13:44 -03:00 · e47b8146dc
commit e47b8146dc
parent b8c48765fb
16 changed files with 333 additions and 44 deletions
--- a/internal/rpc/rpc_test.go
+++ b/internal/rpc/rpc_test.go
@@ -92,6 +92,62 @@ func TestCallReturnsRemoteError(t *testing.T) {
 	}
 }

+func TestCallExposesTypedErrorWithOpID(t *testing.T) { // Call must surface a server error as a typed *ErrorResponse that still carries its code and op id.
+	t.Parallel()
+
+	socketPath, cleanup := serveRPCOnce(t, func(conn net.Conn) { // fake server: read one request, reply with an error envelope
+		defer conn.Close()
+		var req Request // decoded only to consume the request; its fields are not inspected in this test
+		if err := json.NewDecoder(bufio.NewReader(conn)).Decode(&req); err != nil {
+			t.Fatalf("decode request: %v", err) // NOTE(review): if serveRPCOnce runs this handler off the test goroutine, t.Fatalf will not stop the test from here — confirm
+		}
+		if err := json.NewEncoder(conn).Encode(NewErrorWithOpID("not_found", "vm \"foo\" not found", "op-deadbeef00ff")); err != nil {
+			t.Fatalf("encode error response: %v", err)
+		}
+	})
+	defer cleanup()
+
+	_, err := Call[map[string]string](context.Background(), socketPath, "vm.show", nil) // background ctx: no client-side op id is set for this call
+	if err == nil {
+		t.Fatal("Call() returned nil error")
+	}
+	var rpcErr *ErrorResponse
+	if !errors.As(err, &rpcErr) { // errors.As also matches a wrapped *ErrorResponse, so Call may wrap or return it directly
+		t.Fatalf("Call() error %T (%v) is not *ErrorResponse — CLI cannot read the op_id", err, err)
+	}
+	if rpcErr.Code != "not_found" || rpcErr.OpID != "op-deadbeef00ff" { // both fields must match what the fake server encoded above
+		t.Fatalf("typed error = %+v, want code=not_found op-deadbeef00ff", rpcErr)
+	}
+	// String form keeps the op_id in parens so callers that only
+	// log err.Error() still surface the id.
+	if got := rpcErr.Error(); !strings.Contains(got, "(op-deadbeef00ff)") {
+		t.Fatalf("err.Error() = %q, want op-id suffix", got)
+	}
+}
+
+func TestCallForwardsOpIDFromContext(t *testing.T) { // An op id attached to ctx with WithOpID must reach the server in the request's OpID field.
+	t.Parallel()
+
+	var seenReq Request // filled in by the server handler, asserted on by the test body after Call returns
+	socketPath, cleanup := serveRPCOnce(t, func(conn net.Conn) {
+		defer conn.Close()
+		if err := json.NewDecoder(bufio.NewReader(conn)).Decode(&seenReq); err != nil { // capture the decoded request envelope for the assertion below
+			t.Fatalf("decode request: %v", err) // NOTE(review): if serveRPCOnce runs this handler off the test goroutine, t.Fatalf will not stop the test from here — confirm
+		}
+		resp, _ := NewResult(map[string]string{"status": "ok"}) // error ignored; NOTE(review): presumably cannot fail for a plain string map — confirm
+		_ = json.NewEncoder(conn).Encode(resp) // write error ignored; presumably a missing reply would surface as the Call error checked below
+	})
+	defer cleanup()
+
+	ctx := WithOpID(context.Background(), "op-cafef00d1234") // the id the client is expected to forward
+	if _, err := Call[map[string]string](ctx, socketPath, "ping", nil); err != nil {
+		t.Fatalf("Call: %v", err)
+	}
+	if seenReq.OpID != "op-cafef00d1234" { // the envelope must carry exactly the id stored on ctx
+		t.Fatalf("server saw op_id = %q, want op-cafef00d1234", seenReq.OpID)
+	}
+}
+
 func TestCallRejectsMalformedResponse(t *testing.T) {
 	t.Parallel()