From 3aa64a63c15db6d287802db9f40c179dd27409ff Mon Sep 17 00:00:00 2001 From: Thales Maciel Date: Sat, 18 Apr 2026 15:59:27 -0300 Subject: [PATCH] vm run: bound the ssh wait and give a useful error on timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before: `guestWaitForSSHFunc` loops forever bounded only by context cancellation, so if sshd fails to start in the guest `vm run` hangs indefinitely — which burned a long debugging session during the golden-image bring-up. After: the ssh wait gets its own 90s deadline. On guest-side timeout the error names the VM, explains sshd is the likely suspect, points at `banger vm logs <name>` for the console output, and notes the VM is still alive for inspection (or `vm delete` to clean up). Parent context cancellation (Ctrl-C, caller timeout) still surfaces as-is without the hint. `vmRunSSHTimeout` is a var rather than a const so tests can shrink it; the new TestRunVMRunSSHTimeoutReturnsActionableError sets it to 50ms and asserts the error message contains the actionable bits. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/cli/banger.go | 29 ++++++++++++++++++++-- internal/cli/cli_test.go | 52 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 2 deletions(-) diff --git a/internal/cli/banger.go b/internal/cli/banger.go index 304b935..e5e2e5a 100644 --- a/internal/cli/banger.go +++ b/internal/cli/banger.go @@ -160,6 +160,15 @@ const vmRunShallowFetchDepth = 10 const vmRunToolingInstallTimeoutSeconds = 120 +// vmRunSSHTimeout bounds how long `vm run` waits for guest ssh after +// the vsock agent is ready. vsock readiness already means systemd +// reached the banger-vsock-agent unit in multi-user.target, so sshd +// should be up within seconds; a minute plus change is generous +// headroom for a slow first boot while still short enough that a +// wedged sshd surfaces promptly instead of hanging forever. Var, not +// const, so tests can shrink it. 
+var vmRunSSHTimeout = 90 * time.Second + func NewBangerCommand() *cobra.Command { root := &cobra.Command{ Use: "banger", @@ -2797,9 +2806,25 @@ func runVMRun(ctx context.Context, socketPath string, cfg model.DaemonConfig, st } sshAddress := net.JoinHostPort(vm.Runtime.GuestIP, "22") progress.render("waiting for guest ssh") - if err := guestWaitForSSHFunc(ctx, sshAddress, cfg.SSHKeyPath, 250*time.Millisecond); err != nil { - return fmt.Errorf("vm %q is running but guest ssh is unavailable: %w", vmRef, err) + sshCtx, cancelSSH := context.WithTimeout(ctx, vmRunSSHTimeout) + if err := guestWaitForSSHFunc(sshCtx, sshAddress, cfg.SSHKeyPath, 250*time.Millisecond); err != nil { + cancelSSH() + // Surface parent-context cancellation (Ctrl-C, caller + // timeout) as-is. Only the guest-side timeout needs the + // actionable hint. + if errors.Is(ctx.Err(), context.Canceled) || errors.Is(ctx.Err(), context.DeadlineExceeded) { + return fmt.Errorf("vm %q: %w", vmRef, ctx.Err()) + } + return fmt.Errorf( + "vm %q is running but guest ssh did not come up within %s. "+ + "sshd is the likely suspect — inspect the guest console with "+ + "`banger vm logs %s` (look for `Failed to start ssh.service`). "+ + "The VM is still alive; leave it for inspection or remove with `banger vm delete %s`. 
"+ + "underlying error: %w", + vmRef, vmRunSSHTimeout, vmRef, vmRef, err, + ) } + cancelSSH() if spec != nil { progress.render("preparing guest workspace") if _, err := vmWorkspacePrepareFunc(ctx, socketPath, api.VMWorkspacePrepareParams{ diff --git a/internal/cli/cli_test.go b/internal/cli/cli_test.go index 068b1ec..f2f08cc 100644 --- a/internal/cli/cli_test.go +++ b/internal/cli/cli_test.go @@ -1616,6 +1616,58 @@ func TestRunVMRunBareModeSkipsWorkspaceAndTooling(t *testing.T) { } } +func TestRunVMRunSSHTimeoutReturnsActionableError(t *testing.T) { + origBegin := vmCreateBeginFunc + origWaitForSSH := guestWaitForSSHFunc + origTimeout := vmRunSSHTimeout + vmRunSSHTimeout = 50 * time.Millisecond + t.Cleanup(func() { + vmCreateBeginFunc = origBegin + guestWaitForSSHFunc = origWaitForSSH + vmRunSSHTimeout = origTimeout + }) + + vm := model.VMRecord{ + ID: "vm-id", Name: "slowvm", + Runtime: model.VMRuntime{State: model.VMStateRunning, GuestIP: "172.16.0.2"}, + } + vmCreateBeginFunc = func(context.Context, string, api.VMCreateParams) (api.VMCreateBeginResult, error) { + return api.VMCreateBeginResult{Operation: api.VMCreateOperation{ID: "op-1", Stage: "ready", Done: true, Success: true, VM: &vm}}, nil + } + // Simulate the guest never bringing sshd up — the wait-for-ssh + // child context fires its deadline, returning a DeadlineExceeded. 
+ guestWaitForSSHFunc = func(ctx context.Context, _, _ string, _ time.Duration) error { + <-ctx.Done() + return ctx.Err() + } + + var stdout, stderr bytes.Buffer + err := runVMRun( + context.Background(), + "/tmp/bangerd.sock", + model.DaemonConfig{SSHKeyPath: "/tmp/id_ed25519"}, + strings.NewReader(""), + &stdout, &stderr, + api.VMCreateParams{Name: "slowvm"}, + nil, + nil, + ) + if err == nil { + t.Fatal("want timeout error") + } + msg := err.Error() + for _, want := range []string{ + "slowvm", + "did not come up", + "banger vm logs slowvm", + "banger vm delete slowvm", + } { + if !strings.Contains(msg, want) { + t.Fatalf("err = %q, want contains %q", msg, want) + } + } +} + func TestRunVMRunCommandModePropagatesExitCode(t *testing.T) { origBegin := vmCreateBeginFunc origWaitForSSH := guestWaitForSSHFunc