daemon: thread per-RPC op_id end-to-end

Today there's no way to correlate a CLI failure with a daemon log
line. operationLog records relative timing but no id, two concurrent
vm.start calls log indistinguishably, and the async
vmCreateOperationState.ID is user-facing yet never reaches the
journal. The root helper logs plain text to stderr while bangerd
logs JSON, so a merged journalctl is hard to grep across the
trust-boundary split.

Mint a per-RPC op id at dispatch entry, store it on context, and
include it as an "op_id" attr on every operationLog record. The
id is stamped onto every error response (including the early
short-circuit paths bad_version and unknown_method). rpc.Call
forwards the context op id on requests so a daemon RPC and the
helper RPCs it triggers all share one id. The helper now logs
JSON to match bangerd, adopts the inbound id, and emits a single
"helper rpc completed" / "helper rpc failed" line per call so
operators can see at a glance how long each privileged op took.

vmCreateOperationState.ID is now the same id dispatch generated
for vm.create.begin — one identifier between client status polls,
daemon logs, and helper logs.

The wire format gains two optional fields: rpc.Request.OpID and
rpc.ErrorResponse.OpID, both omitempty so older peers (and the
opposite direction) ignore them. ErrorResponse.Error() now appends
"(op-XXXXXX)" to its string form when set; existing callers that
just print err.Error() get the id for free.

Tests cover: dispatch stamps op_id on unknown_method, bad_version,
and handler-returned errors; rpc.Call exposes the typed
*ErrorResponse via errors.As so the CLI can read code/op_id; ctx
op_id is forwarded to the server in the request envelope.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Thales Maciel 2026-04-26 22:13:44 -03:00
parent b8c48765fb
commit e47b8146dc
No known key found for this signature in database
GPG key ID: 33112E6833C34679
16 changed files with 333 additions and 44 deletions

View file

@ -310,17 +310,34 @@ func (d *Daemon) watchRequestDisconnect(conn net.Conn, reader *bufio.Reader, met
}
func (d *Daemon) dispatch(ctx context.Context, req rpc.Request) rpc.Response {
// Per-RPC correlation id is generated unconditionally — even
// errors that short-circuit before reaching a handler get one
// so the operator has a handle for every CLI failure.
// Generation can fail in theory (crypto/rand IO error) —
// degrade gracefully to a blank id rather than tearing down
// the request.
opID, _ := model.NewOpID()
if opID != "" {
ctx = WithOpID(ctx, opID)
}
stampOpID := func(resp rpc.Response) rpc.Response {
if !resp.OK && resp.Error != nil && resp.Error.OpID == "" && opID != "" {
resp.Error.OpID = opID
}
return resp
}
if req.Version != rpc.Version {
return rpc.NewError("bad_version", fmt.Sprintf("unsupported version %d", req.Version))
return stampOpID(rpc.NewError("bad_version", fmt.Sprintf("unsupported version %d", req.Version)))
}
if d.requestHandler != nil {
return d.requestHandler(ctx, req)
return stampOpID(d.requestHandler(ctx, req))
}
h, ok := rpcHandlers[req.Method]
if !ok {
return rpc.NewError("unknown_method", req.Method)
return stampOpID(rpc.NewError("unknown_method", req.Method))
}
return h(ctx, d, req)
return stampOpID(h(ctx, d, req))
}
func (d *Daemon) backgroundLoop() {
@ -346,7 +363,7 @@ func (d *Daemon) backgroundLoop() {
}
func (d *Daemon) reconcile(ctx context.Context) error {
op := d.beginOperation("daemon.reconcile")
op := d.beginOperation(ctx, "daemon.reconcile")
vms, err := d.store.ListVMs(ctx)
if err != nil {
return op.fail(err)
@ -441,14 +458,12 @@ func wireServices(d *Daemon) {
}
if d.img == nil {
d.img = newImageService(imageServiceDeps{
runner: d.runner,
logger: d.logger,
config: d.config,
layout: d.layout,
store: d.store,
beginOperation: func(name string, attrs ...any) *operationLog {
return d.beginOperation(name, attrs...)
},
runner: d.runner,
logger: d.logger,
config: d.config,
layout: d.layout,
store: d.store,
beginOperation: d.beginOperation,
})
}
if d.ws == nil {

View file

@ -1,8 +1,12 @@
package daemon
import (
"context"
"sort"
"strings"
"testing"
"banger/internal/rpc"
)
// TestRPCHandlersMatchDocumentedMethods pins the surface of the RPC
@ -82,3 +86,55 @@ func TestRPCHandlersAllNonNil(t *testing.T) {
}
}
}
// TestDispatchStampsOpIDOnError pins the contract that every error
// response leaving dispatch carries an op_id, even on the
// short-circuit paths (bad_version, unknown_method) that never
// reach a handler. Operators rely on this id to correlate a CLI
// failure to a daemon log line.
func TestDispatchStampsOpIDOnError(t *testing.T) {
d := &Daemon{}
t.Run("unknown_method", func(t *testing.T) {
resp := d.dispatch(context.Background(), rpc.Request{Version: rpc.Version, Method: "no.such.method"})
if resp.OK {
t.Fatalf("expected error response, got %+v", resp)
}
if resp.Error == nil || resp.Error.Code != "unknown_method" {
t.Fatalf("error = %+v, want unknown_method", resp.Error)
}
if !strings.HasPrefix(resp.Error.OpID, "op-") {
t.Fatalf("op_id = %q, want op-* prefix", resp.Error.OpID)
}
})
t.Run("bad_version", func(t *testing.T) {
resp := d.dispatch(context.Background(), rpc.Request{Version: rpc.Version + 99, Method: "ping"})
if resp.OK {
t.Fatalf("expected error response, got %+v", resp)
}
if resp.Error == nil || resp.Error.Code != "bad_version" {
t.Fatalf("error = %+v, want bad_version", resp.Error)
}
if !strings.HasPrefix(resp.Error.OpID, "op-") {
t.Fatalf("op_id = %q, want op-* prefix", resp.Error.OpID)
}
})
}
// TestDispatchPropagatesOpIDFromContext covers the case where a
// handler returns its own rpc.NewError with an empty op_id (most
// service errors do); the dispatch wrapper must stamp the
// dispatch-generated id on the way out.
func TestDispatchPropagatesOpIDFromContext(t *testing.T) {
d := &Daemon{
requestHandler: func(_ context.Context, _ rpc.Request) rpc.Response {
return rpc.NewError("operation_failed", "deliberate test failure")
},
}
resp := d.dispatch(context.Background(), rpc.Request{Version: rpc.Version, Method: "anything"})
if resp.OK || resp.Error == nil {
t.Fatalf("expected error response, got %+v", resp)
}
if !strings.HasPrefix(resp.Error.OpID, "op-") {
t.Fatalf("dispatch did not stamp op_id: %+v", resp.Error)
}
}

View file

@ -47,7 +47,7 @@ type ImageService struct {
// beginOperation is a test seam used by a couple of image ops that
// want structured operation logging. Nil → Daemon's beginOperation,
// injected at construction.
beginOperation func(name string, attrs ...any) *operationLog
beginOperation func(ctx context.Context, name string, attrs ...any) *operationLog
}
// imageServiceDeps names every handle ImageService needs from the
@ -59,7 +59,7 @@ type imageServiceDeps struct {
config model.DaemonConfig
layout paths.Layout
store *store.Store
beginOperation func(name string, attrs ...any) *operationLog
beginOperation func(ctx context.Context, name string, attrs ...any) *operationLog
}
func newImageService(deps imageServiceDeps) *ImageService {

View file

@ -98,7 +98,7 @@ func (s *ImageService) RegisterImage(ctx context.Context, params api.ImageRegist
// imageOpsMu — only the find/rename/upsert commit atom holds the
// lock.
func (s *ImageService) PromoteImage(ctx context.Context, idOrName string) (image model.Image, err error) {
op := s.beginOperation("image.promote")
op := s.beginOperation(ctx, "image.promote")
defer func() {
if err != nil {
op.fail(err, imageLogAttrs(image)...)

View file

@ -9,6 +9,7 @@ import (
"time"
"banger/internal/model"
"banger/internal/rpc"
)
func newDaemonLogger(w io.Writer, rawLevel string) (*slog.Logger, string, error) {
@ -35,9 +36,37 @@ func parseLogLevel(raw string) (slog.Level, string, error) {
}
}
func (d *Daemon) beginOperation(name string, attrs ...any) *operationLog {
// WithOpID stores the per-RPC correlation id on ctx. Re-exported
// from rpc so daemon-side call sites don't have to import rpc just
// for context plumbing. The dispatch layer calls this on every
// incoming request; capability hooks, lifecycle steps, and the
// privileged-ops shim that crosses into the root helper all read
// the id back via OpIDFromContext so a single id stitches the
// whole chain together in journalctl.
func WithOpID(ctx context.Context, opID string) context.Context {
return rpc.WithOpID(ctx, opID)
}
// OpIDFromContext returns the dispatch-assigned op id stored on
// ctx, or "" if none was set.
func OpIDFromContext(ctx context.Context) string {
return rpc.OpIDFromContext(ctx)
}
// beginOperation starts a logged operation. When ctx carries a
// dispatch-assigned op id (see WithOpID) every log line emitted
// through the returned operationLog includes it as an "op_id" attr,
// so the daemon journal can be greppable by id from the user's CLI
// error all the way down through capability hooks and the root
// helper.
func (d *Daemon) beginOperation(ctx context.Context, name string, attrs ...any) *operationLog {
opID := OpIDFromContext(ctx)
allAttrs := append([]any(nil), attrs...)
if opID != "" {
allAttrs = append([]any{"op_id", opID}, allAttrs...)
}
if d.logger != nil {
d.logger.Info("operation started", append([]any{"operation", name}, attrs...)...)
d.logger.Info("operation started", append([]any{"operation", name}, allAttrs...)...)
}
now := time.Now()
return &operationLog{
@ -45,7 +74,8 @@ func (d *Daemon) beginOperation(name string, attrs ...any) *operationLog {
name: name,
started: now,
last: now,
attrs: append([]any(nil), attrs...),
attrs: allAttrs,
opID: opID,
}
}
@ -55,6 +85,16 @@ type operationLog struct {
started time.Time
last time.Time
attrs []any
opID string
}
// OpID exposes the correlation id this operation was started with so
// dispatch can stamp it onto an outgoing error response.
func (o *operationLog) OpID() string {
if o == nil {
return ""
}
return o.opID
}
func (o *operationLog) stage(stage string, attrs ...any) {

View file

@ -39,7 +39,7 @@ type StatsService struct {
config model.DaemonConfig
store *store.Store
net *HostNetwork
beginOperation func(name string, attrs ...any) *operationLog
beginOperation func(ctx context.Context, name string, attrs ...any) *operationLog
// vmAlive / vmHandles are the minimum pair needed to answer "is
// this VM actually running right now?" + "what PID is it?".
@ -68,7 +68,7 @@ type statsServiceDeps struct {
config model.DaemonConfig
store *store.Store
net *HostNetwork
beginOperation func(name string, attrs ...any) *operationLog
beginOperation func(ctx context.Context, name string, attrs ...any) *operationLog
vmAlive func(vm model.VMRecord) bool
vmHandles func(vmID string) model.VMHandles
withVMLockByRef func(ctx context.Context, idOrName string, fn func(model.VMRecord) (model.VMRecord, error)) (model.VMRecord, error)
@ -189,7 +189,7 @@ func (s *StatsService) stopStaleVMs(ctx context.Context) (err error) {
if s.config.AutoStopStaleAfter <= 0 {
return nil
}
op := s.beginOperation("vm.stop_stale")
op := s.beginOperation(ctx, "vm.stop_stale")
defer func() {
if err != nil {
op.fail(err)

View file

@ -28,7 +28,7 @@ import (
// 3. Boot. Only the per-VM lock is held — parallel creates against
// different VMs fully overlap.
func (s *VMService) CreateVM(ctx context.Context, params api.VMCreateParams) (vm model.VMRecord, err error) {
op := s.beginOperation("vm.create")
op := s.beginOperation(ctx, "vm.create")
defer func() {
if err != nil {
op.fail(err)

View file

@ -24,10 +24,21 @@ type vmCreateOperationState struct {
op api.VMCreateOperation
}
func newVMCreateOperationState() (*vmCreateOperationState, error) {
id, err := model.NewID()
if err != nil {
return nil, err
// newVMCreateOperationState constructs the async-progress record for
// a vm.create.begin RPC. When the caller's context already carries a
// dispatch-assigned op id (the normal path), we reuse it so the
// operator-visible status id and the daemon-log op_id are the same
// string. Otherwise we mint a fresh op id — keeps the same shape on
// internal call sites that don't go through dispatch (tests, future
// background creators).
func newVMCreateOperationState(ctx context.Context) (*vmCreateOperationState, error) {
id := OpIDFromContext(ctx)
if id == "" {
var err error
id, err = model.NewOpID()
if err != nil {
return nil, err
}
}
now := model.Now()
return &vmCreateOperationState{
@ -146,12 +157,16 @@ func (op *vmCreateOperationState) cancelOperation() {
}
}
func (s *VMService) BeginVMCreate(_ context.Context, params api.VMCreateParams) (api.VMCreateOperation, error) {
op, err := newVMCreateOperationState()
func (s *VMService) BeginVMCreate(ctx context.Context, params api.VMCreateParams) (api.VMCreateOperation, error) {
op, err := newVMCreateOperationState(ctx)
if err != nil {
return api.VMCreateOperation{}, err
}
createCtx, cancel := context.WithCancel(context.Background())
// Detach from the caller's deadline (the begin RPC returns
// immediately) but preserve the op id so every log line emitted
// by the goroutine carries the same identifier the client just
// got back.
createCtx, cancel := context.WithCancel(WithOpID(context.Background(), op.op.ID))
op.setCancel(cancel)
s.createOps.Insert(op)
go s.runVMCreateOperation(withVMCreateProgress(createCtx, op), op, params)

View file

@ -30,7 +30,7 @@ func (s *VMService) StartVM(ctx context.Context, idOrName string) (model.VMRecor
}
func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image model.Image) (_ model.VMRecord, err error) {
op := s.beginOperation("vm.start", append(vmLogAttrs(vm), imageLogAttrs(image)...)...)
op := s.beginOperation(ctx, "vm.start", append(vmLogAttrs(vm), imageLogAttrs(image)...)...)
defer func() {
if err != nil {
err = annotateLogPath(err, vm.Runtime.LogPath)
@ -97,7 +97,7 @@ func (s *VMService) StopVM(ctx context.Context, idOrName string) (model.VMRecord
func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
vm = current
op := s.beginOperation("vm.stop", "vm_ref", vm.ID)
op := s.beginOperation(ctx, "vm.stop", "vm_ref", vm.ID)
defer func() {
if err != nil {
op.fail(err, vmLogAttrs(vm)...)
@ -154,7 +154,7 @@ func (s *VMService) KillVM(ctx context.Context, params api.VMKillParams) (model.
func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, signalValue string) (vm model.VMRecord, err error) {
vm = current
op := s.beginOperation("vm.kill", "vm_ref", vm.ID, "signal", signalValue)
op := s.beginOperation(ctx, "vm.kill", "vm_ref", vm.ID, "signal", signalValue)
defer func() {
if err != nil {
op.fail(err, vmLogAttrs(vm)...)
@ -209,7 +209,7 @@ func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, si
}
func (s *VMService) RestartVM(ctx context.Context, idOrName string) (vm model.VMRecord, err error) {
op := s.beginOperation("vm.restart", "vm_ref", idOrName)
op := s.beginOperation(ctx, "vm.restart", "vm_ref", idOrName)
defer func() {
if err != nil {
op.fail(err, vmLogAttrs(vm)...)
@ -244,7 +244,7 @@ func (s *VMService) DeleteVM(ctx context.Context, idOrName string) (model.VMReco
func (s *VMService) deleteVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
vm = current
op := s.beginOperation("vm.delete", "vm_ref", vm.ID)
op := s.beginOperation(ctx, "vm.delete", "vm_ref", vm.ID)
defer func() {
if err != nil {
op.fail(err, vmLogAttrs(vm)...)

View file

@ -76,7 +76,7 @@ type VMService struct {
// VMService never reaches back to *Daemon.
capHooks capabilityHooks
beginOperation func(name string, attrs ...any) *operationLog
beginOperation func(ctx context.Context, name string, attrs ...any) *operationLog
}
// capabilityHooks bundles the capability-dispatch entry points that
@ -104,7 +104,7 @@ type vmServiceDeps struct {
ws *WorkspaceService
priv privilegedOps
capHooks capabilityHooks
beginOperation func(name string, attrs ...any) *operationLog
beginOperation func(ctx context.Context, name string, attrs ...any) *operationLog
vsockHostDevice string
}

View file

@ -17,7 +17,7 @@ func (s *VMService) SetVM(ctx context.Context, params api.VMSetParams) (model.VM
func (s *VMService) setVMLocked(ctx context.Context, current model.VMRecord, params api.VMSetParams) (vm model.VMRecord, err error) {
vm = current
op := s.beginOperation("vm.set", "vm_ref", vm.ID)
op := s.beginOperation(ctx, "vm.set", "vm_ref", vm.ID)
defer func() {
if err != nil {
op.fail(err, vmLogAttrs(vm)...)

View file

@ -43,7 +43,7 @@ type WorkspaceService struct {
imageWorkSeed func(ctx context.Context, image model.Image, fingerprint string) error
withVMLockByRef func(ctx context.Context, idOrName string, fn func(model.VMRecord) (model.VMRecord, error)) (model.VMRecord, error)
beginOperation func(name string, attrs ...any) *operationLog
beginOperation func(ctx context.Context, name string, attrs ...any) *operationLog
// repoInspector is the Inspector used by the real InspectRepo /
// ImportRepoToGuest fallbacks when the test seams below aren't
@ -71,7 +71,7 @@ type workspaceServiceDeps struct {
imageResolver func(ctx context.Context, idOrName string) (model.Image, error)
imageWorkSeed func(ctx context.Context, image model.Image, fingerprint string) error
withVMLockByRef func(ctx context.Context, idOrName string, fn func(model.VMRecord) (model.VMRecord, error)) (model.VMRecord, error)
beginOperation func(name string, attrs ...any) *operationLog
beginOperation func(ctx context.Context, name string, attrs ...any) *operationLog
}
func newWorkspaceService(deps workspaceServiceDeps) *WorkspaceService {

View file

@ -200,6 +200,21 @@ func NewID() (string, error) {
return hex.EncodeToString(buf), nil
}
// NewOpID returns a short identifier for tracing a single RPC
// operation across the daemon, the root helper, and the user-visible
// CLI error string. Format: "op-" + 12 hex chars (48 bits of entropy
// — collisions inside one daemon session are vanishingly unlikely
// and don't matter beyond it). Short enough to copy-paste from a
// CLI error into a journalctl --grep, long enough to actually
// disambiguate.
func NewOpID() (string, error) {
buf := make([]byte, 6)
if _, err := rand.Read(buf); err != nil {
return "", err
}
return "op-" + hex.EncodeToString(buf), nil
}
func ParseSize(raw string) (int64, error) {
if raw == "" {
return 0, errors.New("size is required")

View file

@ -285,7 +285,11 @@ func Open() (*Server, error) {
return &Server{
meta: meta,
runner: system.NewRunner(),
logger: slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelInfo})),
// JSON to match bangerd. Mixed text/JSON streams in the
// merged journalctl made the daemon side painful to grep;
// this aligns the helper so a single greppable shape spans
// both units.
logger: slog.New(slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelInfo})),
}, nil
}
@ -352,7 +356,29 @@ func (s *Server) handleConn(conn net.Conn) {
_ = json.NewEncoder(conn).Encode(rpc.NewError("bad_request", err.Error()))
return
}
resp := s.dispatch(context.Background(), req)
// Adopt the daemon's op id so a single greppable id covers the
// whole call chain (CLI → daemon → helper). Entry log at debug
// level keeps production quiet; the completion log fires at
// info-on-success / error-on-failure with duration so an
// operator can see at a glance how long each privileged op
// took.
ctx := rpc.WithOpID(context.Background(), req.OpID)
start := time.Now()
if s.logger != nil {
s.logger.Debug("helper rpc", "method", req.Method, "op_id", req.OpID)
}
resp := s.dispatch(ctx, req)
if !resp.OK && resp.Error != nil && resp.Error.OpID == "" && req.OpID != "" {
resp.Error.OpID = req.OpID
}
if s.logger != nil {
duration := time.Since(start).Milliseconds()
if !resp.OK && resp.Error != nil {
s.logger.Error("helper rpc failed", "method", req.Method, "op_id", req.OpID, "duration_ms", duration, "code", resp.Error.Code, "message", resp.Error.Message)
} else {
s.logger.Info("helper rpc completed", "method", req.Method, "op_id", req.OpID, "duration_ms", duration)
}
}
_ = json.NewEncoder(conn).Encode(resp)
}

View file

@ -18,6 +18,40 @@ type Request struct {
Version int `json:"version"`
Method string `json:"method"`
Params json.RawMessage `json:"params,omitempty"`
// OpID is the per-RPC correlation id. Optional on the wire so
// older clients (which don't set it) and older servers (which
// don't read it) keep interoperating. The daemon attaches it on
// every incoming request via dispatch; rpc.Call forwards
// whatever id is on ctx so a helper RPC carries the same id as
// the daemon RPC that triggered it.
OpID string `json:"op_id,omitempty"`
}
// opIDKey is the context-value key for the per-RPC correlation id
// that flows from CLI → daemon → root helper. Lives in the rpc
// package because rpc.Call needs to read it without depending on
// the daemon package; daemon and roothelper both import it.
type opIDKey struct{}
// WithOpID stores opID on ctx. Used by the daemon dispatch layer to
// inject the per-request id; rpc.Call picks it up automatically.
func WithOpID(ctx context.Context, opID string) context.Context {
if ctx == nil || opID == "" {
return ctx
}
return context.WithValue(ctx, opIDKey{}, opID)
}
// OpIDFromContext returns the op id stored on ctx by WithOpID, or
// "" if none was set.
func OpIDFromContext(ctx context.Context) string {
if ctx == nil {
return ""
}
if id, _ := ctx.Value(opIDKey{}).(string); id != "" {
return id
}
return ""
}
type Response struct {
@ -29,6 +63,29 @@ type Response struct {
type ErrorResponse struct {
Code string `json:"code"`
Message string `json:"message"`
// OpID is the daemon-assigned correlation id for the RPC that
// produced this error. Optional and may be empty (older daemons
// don't set it); when present the CLI surfaces it so an operator
// can grep journalctl by that id and find the full context.
OpID string `json:"op_id,omitempty"`
}
// Error makes ErrorResponse satisfy the error interface so callers
// can errors.As it out of an rpc.Call return value and read the
// structured fields directly. The default string form is
// "code: message (op-id)" — the op id only appears when the daemon
// attached one. CLI code paths that want a translated, user-facing
// message render the typed fields themselves; this fallback is for
// log lines, fmt.Errorf %w wrappers, and any caller that hasn't
// bothered to errors.As yet.
func (e *ErrorResponse) Error() string {
if e == nil {
return ""
}
if e.OpID == "" {
return e.Code + ": " + e.Message
}
return e.Code + ": " + e.Message + " (" + e.OpID + ")"
}
func NewResult(v any) (Response, error) {
@ -43,6 +100,12 @@ func NewError(code, message string) Response {
return Response{OK: false, Error: &ErrorResponse{Code: code, Message: message}}
}
// NewErrorWithOpID is the variant for daemon dispatch sites that have
// resolved an op id by the time they encode the response.
func NewErrorWithOpID(code, message, opID string) Response {
return Response{OK: false, Error: &ErrorResponse{Code: code, Message: message, OpID: opID}}
}
func DecodeParams[T any](req Request) (T, error) {
var zero T
if len(req.Params) == 0 {
@ -78,7 +141,7 @@ func Call[T any](ctx context.Context, socketPath, method string, params any) (T,
_ = conn.SetDeadline(deadline)
}
request := Request{Version: Version, Method: method}
request := Request{Version: Version, Method: method, OpID: OpIDFromContext(ctx)}
if params != nil {
raw, err := json.Marshal(params)
if err != nil {
@ -105,7 +168,10 @@ func Call[T any](ctx context.Context, socketPath, method string, params any) (T,
if response.Error == nil {
return zero, errors.New("rpc error")
}
return zero, fmt.Errorf("%s: %s", response.Error.Code, response.Error.Message)
// Return the typed error directly so callers that need code
// or op_id can errors.As it out. err.Error() format is
// preserved for callers that only print the message.
return zero, response.Error
}
if len(response.Result) == 0 {
return zero, nil

View file

@ -92,6 +92,62 @@ func TestCallReturnsRemoteError(t *testing.T) {
}
}
func TestCallExposesTypedErrorWithOpID(t *testing.T) {
t.Parallel()
socketPath, cleanup := serveRPCOnce(t, func(conn net.Conn) {
defer conn.Close()
var req Request
if err := json.NewDecoder(bufio.NewReader(conn)).Decode(&req); err != nil {
t.Fatalf("decode request: %v", err)
}
if err := json.NewEncoder(conn).Encode(NewErrorWithOpID("not_found", "vm \"foo\" not found", "op-deadbeef00ff")); err != nil {
t.Fatalf("encode error response: %v", err)
}
})
defer cleanup()
_, err := Call[map[string]string](context.Background(), socketPath, "vm.show", nil)
if err == nil {
t.Fatal("Call() returned nil error")
}
var rpcErr *ErrorResponse
if !errors.As(err, &rpcErr) {
t.Fatalf("Call() error %T (%v) is not *ErrorResponse — CLI cannot read the op_id", err, err)
}
if rpcErr.Code != "not_found" || rpcErr.OpID != "op-deadbeef00ff" {
t.Fatalf("typed error = %+v, want code=not_found op-deadbeef00ff", rpcErr)
}
// String form keeps the op_id in parens so callers that only
// log err.Error() still surface the id.
if got := rpcErr.Error(); !strings.Contains(got, "(op-deadbeef00ff)") {
t.Fatalf("err.Error() = %q, want op-id suffix", got)
}
}
func TestCallForwardsOpIDFromContext(t *testing.T) {
t.Parallel()
var seenReq Request
socketPath, cleanup := serveRPCOnce(t, func(conn net.Conn) {
defer conn.Close()
if err := json.NewDecoder(bufio.NewReader(conn)).Decode(&seenReq); err != nil {
t.Fatalf("decode request: %v", err)
}
resp, _ := NewResult(map[string]string{"status": "ok"})
_ = json.NewEncoder(conn).Encode(resp)
})
defer cleanup()
ctx := WithOpID(context.Background(), "op-cafef00d1234")
if _, err := Call[map[string]string](ctx, socketPath, "ping", nil); err != nil {
t.Fatalf("Call: %v", err)
}
if seenReq.OpID != "op-cafef00d1234" {
t.Fatalf("server saw op_id = %q, want op-cafef00d1234", seenReq.OpID)
}
}
func TestCallRejectsMalformedResponse(t *testing.T) {
t.Parallel()