vm run: bound the ssh wait and give a useful error on timeout
Before: `guestWaitForSSHFunc` loops until its context is cancelled, so if sshd fails to start in the guest, `vm run` hangs indefinitely — which burned a long debugging session during the golden-image bring-up. After: the ssh wait gets its own 90s deadline. On guest-side timeout, the error names the VM, explains that sshd is the likely suspect, points at `banger vm logs <name>` for the console output, and notes that the VM is still alive for inspection (or `vm delete` to clean up). Parent context cancellation (Ctrl-C, caller timeout) still surfaces as-is, without the hint. `vmRunSSHTimeout` is a var rather than a const so tests can shrink it; the new TestRunVMRunSSHTimeoutReturnsActionableError sets it to 50ms and asserts that the error message contains the actionable bits. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
ac7974f5b9
commit
3aa64a63c1
2 changed files with 79 additions and 2 deletions
|
|
@ -160,6 +160,15 @@ const vmRunShallowFetchDepth = 10
|
||||||
|
|
||||||
const vmRunToolingInstallTimeoutSeconds = 120
|
const vmRunToolingInstallTimeoutSeconds = 120
|
||||||
|
|
||||||
|
// vmRunSSHTimeout bounds how long `vm run` waits for guest ssh after
|
||||||
|
// the vsock agent is ready. vsock readiness already means systemd
|
||||||
|
// reached the banger-vsock-agent unit in multi-user.target, so sshd
|
||||||
|
// should be up within seconds; 90 seconds is generous
|
||||||
|
// headroom for a slow first boot while still short enough that a
|
||||||
|
// wedged sshd surfaces promptly instead of hanging forever. Var, not
|
||||||
|
// const, so tests can shrink it.
|
||||||
|
var vmRunSSHTimeout = 90 * time.Second
|
||||||
|
|
||||||
func NewBangerCommand() *cobra.Command {
|
func NewBangerCommand() *cobra.Command {
|
||||||
root := &cobra.Command{
|
root := &cobra.Command{
|
||||||
Use: "banger",
|
Use: "banger",
|
||||||
|
|
@ -2797,9 +2806,25 @@ func runVMRun(ctx context.Context, socketPath string, cfg model.DaemonConfig, st
|
||||||
}
|
}
|
||||||
sshAddress := net.JoinHostPort(vm.Runtime.GuestIP, "22")
|
sshAddress := net.JoinHostPort(vm.Runtime.GuestIP, "22")
|
||||||
progress.render("waiting for guest ssh")
|
progress.render("waiting for guest ssh")
|
||||||
if err := guestWaitForSSHFunc(ctx, sshAddress, cfg.SSHKeyPath, 250*time.Millisecond); err != nil {
|
sshCtx, cancelSSH := context.WithTimeout(ctx, vmRunSSHTimeout)
|
||||||
return fmt.Errorf("vm %q is running but guest ssh is unavailable: %w", vmRef, err)
|
if err := guestWaitForSSHFunc(sshCtx, sshAddress, cfg.SSHKeyPath, 250*time.Millisecond); err != nil {
|
||||||
|
cancelSSH()
|
||||||
|
// Surface parent-context cancellation (Ctrl-C, caller
|
||||||
|
// timeout) as-is. Only the guest-side timeout needs the
|
||||||
|
// actionable hint.
|
||||||
|
if errors.Is(ctx.Err(), context.Canceled) || errors.Is(ctx.Err(), context.DeadlineExceeded) {
|
||||||
|
return fmt.Errorf("vm %q: %w", vmRef, ctx.Err())
|
||||||
|
}
|
||||||
|
return fmt.Errorf(
|
||||||
|
"vm %q is running but guest ssh did not come up within %s. "+
|
||||||
|
"sshd is the likely suspect — inspect the guest console with "+
|
||||||
|
"`banger vm logs %s` (look for `Failed to start ssh.service`). "+
|
||||||
|
"The VM is still alive; leave it for inspection or remove with `banger vm delete %s`. "+
|
||||||
|
"underlying error: %w",
|
||||||
|
vmRef, vmRunSSHTimeout, vmRef, vmRef, err,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
cancelSSH()
|
||||||
if spec != nil {
|
if spec != nil {
|
||||||
progress.render("preparing guest workspace")
|
progress.render("preparing guest workspace")
|
||||||
if _, err := vmWorkspacePrepareFunc(ctx, socketPath, api.VMWorkspacePrepareParams{
|
if _, err := vmWorkspacePrepareFunc(ctx, socketPath, api.VMWorkspacePrepareParams{
|
||||||
|
|
|
||||||
|
|
@ -1616,6 +1616,58 @@ func TestRunVMRunBareModeSkipsWorkspaceAndTooling(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunVMRunSSHTimeoutReturnsActionableError(t *testing.T) {
|
||||||
|
origBegin := vmCreateBeginFunc
|
||||||
|
origWaitForSSH := guestWaitForSSHFunc
|
||||||
|
origTimeout := vmRunSSHTimeout
|
||||||
|
vmRunSSHTimeout = 50 * time.Millisecond
|
||||||
|
t.Cleanup(func() {
|
||||||
|
vmCreateBeginFunc = origBegin
|
||||||
|
guestWaitForSSHFunc = origWaitForSSH
|
||||||
|
vmRunSSHTimeout = origTimeout
|
||||||
|
})
|
||||||
|
|
||||||
|
vm := model.VMRecord{
|
||||||
|
ID: "vm-id", Name: "slowvm",
|
||||||
|
Runtime: model.VMRuntime{State: model.VMStateRunning, GuestIP: "172.16.0.2"},
|
||||||
|
}
|
||||||
|
vmCreateBeginFunc = func(context.Context, string, api.VMCreateParams) (api.VMCreateBeginResult, error) {
|
||||||
|
return api.VMCreateBeginResult{Operation: api.VMCreateOperation{ID: "op-1", Stage: "ready", Done: true, Success: true, VM: &vm}}, nil
|
||||||
|
}
|
||||||
|
// Simulate the guest never bringing sshd up — the wait-for-ssh
|
||||||
|
// child context's deadline expires and the stub returns context.DeadlineExceeded.
|
||||||
|
guestWaitForSSHFunc = func(ctx context.Context, _, _ string, _ time.Duration) error {
|
||||||
|
<-ctx.Done()
|
||||||
|
return ctx.Err()
|
||||||
|
}
|
||||||
|
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
err := runVMRun(
|
||||||
|
context.Background(),
|
||||||
|
"/tmp/bangerd.sock",
|
||||||
|
model.DaemonConfig{SSHKeyPath: "/tmp/id_ed25519"},
|
||||||
|
strings.NewReader(""),
|
||||||
|
&stdout, &stderr,
|
||||||
|
api.VMCreateParams{Name: "slowvm"},
|
||||||
|
nil,
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("want timeout error")
|
||||||
|
}
|
||||||
|
msg := err.Error()
|
||||||
|
for _, want := range []string{
|
||||||
|
"slowvm",
|
||||||
|
"did not come up",
|
||||||
|
"banger vm logs slowvm",
|
||||||
|
"banger vm delete slowvm",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(msg, want) {
|
||||||
|
t.Fatalf("err = %q, want contains %q", msg, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRunVMRunCommandModePropagatesExitCode(t *testing.T) {
|
func TestRunVMRunCommandModePropagatesExitCode(t *testing.T) {
|
||||||
origBegin := vmCreateBeginFunc
|
origBegin := vmCreateBeginFunc
|
||||||
origWaitForSSH := guestWaitForSSHFunc
|
origWaitForSSH := guestWaitForSSHFunc
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue