vm run: bound the ssh wait and give a useful error on timeout

Before: `guestWaitForSSHFunc` loops forever, bounded only by context
cancellation, so if sshd fails to start in the guest, `vm run` hangs
indefinitely — which burned a long debugging session during the
golden-image bring-up.

After: the ssh wait gets its own 90s deadline. On guest-side timeout
the error names the VM, explains sshd is the likely suspect, points
at `banger vm logs <name>` for the console output, and notes the VM
is still alive for inspection (or `vm delete` to clean up). Parent
context cancellation (Ctrl-C, caller timeout) still surfaces as-is
without the hint.

`vmRunSSHTimeout` is a var rather than a const so tests can shrink
it; the new TestRunVMRunSSHTimeoutReturnsActionableError sets it to
50ms and asserts the error message contains the actionable bits.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Thales Maciel 2026-04-18 15:59:27 -03:00
parent ac7974f5b9
commit 3aa64a63c1
No known key found for this signature in database
GPG key ID: 33112E6833C34679
2 changed files with 79 additions and 2 deletions

View file

@ -160,6 +160,15 @@ const vmRunShallowFetchDepth = 10
const vmRunToolingInstallTimeoutSeconds = 120
// vmRunSSHTimeout is the upper bound on how long `vm run` waits for the
// guest's ssh to become reachable once the vsock agent reports ready.
//
// By the time vsock is ready, systemd has already reached the
// banger-vsock-agent unit in multi-user.target, so sshd is expected to
// follow within seconds. Ninety seconds leaves generous headroom for a
// slow first boot, yet is short enough that a wedged sshd is reported
// promptly rather than hanging forever.
//
// This is a variable rather than a constant so tests can shorten it.
var vmRunSSHTimeout time.Duration = 90 * time.Second
func NewBangerCommand() *cobra.Command {
root := &cobra.Command{
Use: "banger",
@ -2797,9 +2806,25 @@ func runVMRun(ctx context.Context, socketPath string, cfg model.DaemonConfig, st
}
sshAddress := net.JoinHostPort(vm.Runtime.GuestIP, "22")
progress.render("waiting for guest ssh")
if err := guestWaitForSSHFunc(ctx, sshAddress, cfg.SSHKeyPath, 250*time.Millisecond); err != nil {
return fmt.Errorf("vm %q is running but guest ssh is unavailable: %w", vmRef, err)
sshCtx, cancelSSH := context.WithTimeout(ctx, vmRunSSHTimeout)
if err := guestWaitForSSHFunc(sshCtx, sshAddress, cfg.SSHKeyPath, 250*time.Millisecond); err != nil {
cancelSSH()
// Surface parent-context cancellation (Ctrl-C, caller
// timeout) as-is. Only the guest-side timeout needs the
// actionable hint.
if errors.Is(ctx.Err(), context.Canceled) || errors.Is(ctx.Err(), context.DeadlineExceeded) {
return fmt.Errorf("vm %q: %w", vmRef, ctx.Err())
}
return fmt.Errorf(
"vm %q is running but guest ssh did not come up within %s. "+
"sshd is the likely suspect — inspect the guest console with "+
"`banger vm logs %s` (look for `Failed to start ssh.service`). "+
"The VM is still alive; leave it for inspection or remove with `banger vm delete %s`. "+
"underlying error: %w",
vmRef, vmRunSSHTimeout, vmRef, vmRef, err,
)
}
cancelSSH()
if spec != nil {
progress.render("preparing guest workspace")
if _, err := vmWorkspacePrepareFunc(ctx, socketPath, api.VMWorkspacePrepareParams{