banger/internal/cli/vm_run.go

package cli

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"net"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"time"

	"banger/internal/api"
	"banger/internal/daemon/workspace"
	"banger/internal/model"
	"banger/internal/toolingplan"

	"github.com/spf13/cobra"
)

// vmRunGuestClient is the narrow guest-SSH surface vm run needs. The
// daemon's guest-SSH package returns a value that satisfies this
// interface directly; we restate it here so tests can plug in fakes
// without pulling the full daemon in.
type vmRunGuestClient interface {
	Close() error
	UploadFile(ctx context.Context, remotePath string, mode os.FileMode, data []byte, logWriter io.Writer) error
	RunScript(ctx context.Context, script string, logWriter io.Writer) error
	StreamTar(ctx context.Context, sourceDir, remoteCommand string, logWriter io.Writer) error
	StreamTarEntries(ctx context.Context, sourceDir string, entries []string, remoteCommand string, logWriter io.Writer) error
}

// vmRunRepo is the CLI-local view of the workspace argument to
// `vm run`: an absolute source path that passed preflight, plus the
// two branch flags. Everything else the flow needs (RepoRoot,
// RepoName, HEAD commit, etc.) comes back from the workspace.prepare
// RPC, which does the full git inspection daemon-side.
type vmRunRepo struct {
	sourcePath       string
	branchName       string
	fromRef          string
	includeUntracked bool
}

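// vmRunToolingInstallTimeoutSeconds bounds each deterministic tool
// install step in the guest bootstrap script (enforced by
// run_bounded_best_effort) so a single hung installer cannot stall
// the whole harness.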
const vmRunToolingInstallTimeoutSeconds = 120

// vmRunSSHTimeout bounds how long `vm run` waits for guest ssh after
// the vsock agent is ready. vsock readiness already means systemd
// should be up within seconds; a minute plus change is generous
// headroom for a slow first boot while still short enough that a
// wedged sshd surfaces promptly instead of hanging forever. Var, not
// const, so tests can shrink it.
var vmRunSSHTimeout = 90 * time.Second

// ExitCodeError wraps a remote command's exit status so the CLI's main()
// can propagate it verbatim. Only errors explicitly wrapped in this
// type get forwarded as process exit codes — plain *exec.ExitError
// values (from unrelated subprocesses like mkfs.ext4) must still
// surface as regular errors so the user sees a message.
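//
// A hypothetical caller in main() might unwrap it along these lines:
//
//	var ec ExitCodeError
//	if errors.As(err, &ec) {
//		os.Exit(ec.Code)
//	}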
type ExitCodeError struct {
	Code int
}

func (e ExitCodeError) Error() string {
	return fmt.Sprintf("exit status %d", e.Code)
}

// vmRunPreflightRepo validates a vm run workspace path BEFORE the VM
// is created, so bad paths fail fast instead of leaving the user
// with an orphaned VM. The check is intentionally minimal: the
// daemon's PrepareVMWorkspace does a full git inspection (branch,
// HEAD, identity, overlay) and returns everything the tooling
// harness needs, so duplicating the heavy lifting here just doubles
// the I/O. We only enforce what the user can fix locally before
// banger commits to creating a VM:
//
//
//   - the path exists and is a directory,
//   - it sits inside a non-bare git repository,
//   - the repository has no submodules (unsupported in the shallow
//     overlay mode vm run uses).
func (d *deps) vmRunPreflightRepo(ctx context.Context, rawPath string) (string, error) {
	if strings.TrimSpace(rawPath) == "" {
		wd, err := d.cwd()
		if err != nil {
			return "", err
		}
		rawPath = wd
	}
	sourcePath, err := workspace.ResolveSourcePath(rawPath)
	if err != nil {
		return "", err
	}
	repoRoot, err := d.repoInspector.GitTrimmedOutput(ctx, sourcePath, "rev-parse", "--show-toplevel")
	if err != nil {
		return "", fmt.Errorf("%s is not inside a git repository", sourcePath)
	}
	isBare, err := d.repoInspector.GitTrimmedOutput(ctx, repoRoot, "rev-parse", "--is-bare-repository")
	if err != nil {
		return "", fmt.Errorf("inspect git repository %s: %w", repoRoot, err)
	}
	if isBare == "true" {
		return "", fmt.Errorf("vm run requires a non-bare git repository: %s", repoRoot)
	}
	submodules, err := d.repoInspector.ListSubmodules(ctx, repoRoot)
	if err != nil {
		return "", err
	}
	if len(submodules) > 0 {
		return "", fmt.Errorf("vm run does not support git submodules in %s (%s); use `vm create` + `vm workspace prepare --mode full_copy`", repoRoot, strings.Join(submodules, ", "))
	}
	return sourcePath, nil
}

// repoHasMiseFiles reports whether the repo at sourcePath contains a
// mise tooling manifest. Used as a host-side preflight: when --nat is
// off and a manifest is present, vm run refuses early instead of
// committing to a VM that will silently fail to install tools.
func repoHasMiseFiles(sourcePath string) (bool, error) {
	for _, name := range []string{".mise.toml", ".tool-versions"} {
		info, err := os.Stat(filepath.Join(sourcePath, name))
		if err == nil && !info.IsDir() {
			return true, nil
		}
		if err != nil && !errors.Is(err, os.ErrNotExist) {
			return false, fmt.Errorf("inspect %s: %w", name, err)
		}
	}
	return false, nil
}

// splitVMRunArgs partitions cobra positional args into the optional path
// argument and the trailing command (everything after a `--` separator).
// The path slice may contain 0..1 entries; the command slice may be empty.
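//
//	// e.g. `banger vm run ./repo -- make test` arrives as
//	// args = ["./repo", "make", "test"] with ArgsLenAtDash() == 1,
//	// yielding pathArgs = ["./repo"], commandArgs = ["make", "test"].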
func splitVMRunArgs(cmd *cobra.Command, args []string) (pathArgs, commandArgs []string) {
	dash := cmd.ArgsLenAtDash()
	if dash < 0 {
		return args, nil
	}
	if dash > len(args) {
		dash = len(args)
	}
	return args[:dash], args[dash:]
}

// runVMRun orchestrates the full `vm run` flow: create the VM, wait
// for guest ssh, optionally materialise a workspace and kick off the
// tooling bootstrap, then either attach interactively or run the
// user's command and propagate its exit status.
func (d *deps) runVMRun(ctx context.Context, socketPath string, cfg model.DaemonConfig, stdin io.Reader, stdout, stderr io.Writer, params api.VMCreateParams, repo *vmRunRepo, command []string, removeOnExit, detach, skipBootstrap, verbose bool) error {
	if repo != nil && !skipBootstrap && !params.NATEnabled {
		hasMise, err := repoHasMiseFiles(repo.sourcePath)
		if err != nil {
			return err
		}
		if hasMise {
			return errors.New("tooling bootstrap requires --nat (or pass --no-bootstrap to skip)")
		}
	}
	progress := newVMRunProgressRenderer(stderr, verbose)
	defer progress.clear()
	vm, err := d.runVMCreate(ctx, socketPath, stderr, params, verbose)
	if err != nil {
		return err
	}
	vmRef := strings.TrimSpace(vm.Name)
	if vmRef == "" {
		vmRef = shortID(vm.ID)
	}
	// --rm cleanup is wired AFTER ssh is confirmed. An ssh-wait
	// timeout leaves the VM alive for `vm logs` inspection (our
	// error message tells the user that); the cleanup only fires
	// once the session phase runs.
	shouldRemove := false
	if removeOnExit {
		defer func() {
			if !shouldRemove {
				return
			}
			// Use a fresh context so Ctrl-C during the session
			// doesn't abort the delete RPC.
			cleanupCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
			defer cancel()
			if err := d.vmDelete(cleanupCtx, socketPath, vmRef); err != nil {
				progress.clear()
				printVMRunWarning(stderr, fmt.Sprintf("--rm cleanup failed: %v (leaked vm %q; delete manually)", err, vmRef))
			} else if err := removeUserKnownHosts(vm); err != nil {
				progress.clear()
				printVMRunWarning(stderr, fmt.Sprintf("known_hosts cleanup failed: %v", err))
			}
		}()
	}
	sshAddress := net.JoinHostPort(vm.Runtime.GuestIP, "22")
	progress.render("waiting for guest ssh")
	sshCtx, cancelSSH := context.WithTimeout(ctx, vmRunSSHTimeout)
	if err := d.guestWaitForSSH(sshCtx, sshAddress, cfg.SSHKeyPath, 250*time.Millisecond); err != nil {
		cancelSSH()
		// Surface parent-context cancellation (Ctrl-C, caller
		// timeout) as-is. Only the guest-side timeout needs the
		// actionable hint.
		if errors.Is(ctx.Err(), context.Canceled) || errors.Is(ctx.Err(), context.DeadlineExceeded) {
			return fmt.Errorf("vm %q: %w", vmRef, ctx.Err())
		}
		return fmt.Errorf(
			"vm %q is running but guest ssh did not come up within %s. "+
				"sshd is the likely suspect — inspect the guest console with "+
				"`banger vm logs %s` (look for `Failed to start ssh.service`). "+
				"The VM is still alive; leave it for inspection or remove with `banger vm delete %s`. "+
				"underlying error: %w",
			vmRef, vmRunSSHTimeout, vmRef, vmRef, err,
		)
	}
	cancelSSH()
	shouldRemove = removeOnExit
	if repo != nil {
		progress.render("preparing guest workspace")
		// --from is only meaningful paired with --branch; the daemon
		// rejects "from without branch" outright. Our flag default is
		// "HEAD" (useful only when --branch is set), so scrub it when
		// branch is empty to avoid a false "workspace from requires
		// branch" error.
		fromRef := ""
		if strings.TrimSpace(repo.branchName) != "" {
			fromRef = repo.fromRef
		}
		if !repo.includeUntracked {
			progress.clear()
			d.noteUntrackedSkipped(ctx, stderr, repo.sourcePath)
		}
		prepared, err := d.vmWorkspacePrepare(ctx, socketPath, api.VMWorkspacePrepareParams{
			IDOrName:         vmRef,
			SourcePath:       repo.sourcePath,
			GuestPath:        vmRunGuestDir(),
			Branch:           repo.branchName,
			From:             fromRef,
			Mode:             string(model.WorkspacePrepareModeShallowOverlay),
			IncludeUntracked: repo.includeUntracked,
		})
		if err != nil {
			return fmt.Errorf("vm %q is running but workspace prepare failed: %w", vmRef, err)
		}
		// The prepare RPC already did the full git inspection on the
		// daemon side; grab what the tooling harness needs from its
		// result instead of re-inspecting here.
		if len(command) == 0 && !skipBootstrap {
			client, err := d.guestDial(ctx, sshAddress, cfg.SSHKeyPath)
			if err != nil {
				return fmt.Errorf("vm %q is running but guest ssh is unavailable: %w", vmRef, err)
			}
			if err := d.startVMRunToolingHarness(ctx, client, prepared.Workspace.RepoRoot, prepared.Workspace.RepoName, progress, detach, stderr); err != nil {
				progress.clear()
				printVMRunWarning(stderr, fmt.Sprintf("guest tooling bootstrap start failed: %v", err))
			}
			_ = client.Close()
		}
	}
	if detach {
		progress.commitLine(fmt.Sprintf("vm %s running; reconnect with: banger vm ssh %s", vmRef, vmRef))
		return nil
	}
	sshArgs, err := sshCommandArgs(cfg, vm.Runtime.GuestIP, command)
	if err != nil {
		return fmt.Errorf("vm %q is running but ssh args could not be built: %w", vmRef, err)
	}
	if len(command) > 0 {
		progress.render("running command in guest")
		progress.clear()
		if err := d.sshExec(ctx, stdin, stdout, stderr, sshArgs); err != nil {
			var exitErr *exec.ExitError
			if errors.As(err, &exitErr) {
				return ExitCodeError{Code: exitErr.ExitCode()}
			}
			return err
		}
		return nil
	}
	progress.render("attaching to guest")
	progress.clear()
	return d.runSSHSession(ctx, socketPath, vmRef, stdin, stdout, stderr, sshArgs, removeOnExit)
}

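// vmRunGuestDir is the fixed guest-side path the workspace is
// materialised into; the tooling harness cd's here before installing.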
func vmRunGuestDir() string {
	return "/root/repo"
}

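// vmRunToolingHarnessPath is the guest-side upload location for the
// bootstrap script, namespaced by repo name so different repos get
// distinct scripts.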
func vmRunToolingHarnessPath(repoName string) string {
	return filepath.ToSlash(filepath.Join("/tmp", "banger-vm-run-tooling-"+repoName+".sh"))
}

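// vmRunToolingHarnessLogPath is the guest-side log file the harness
// writes to; progress messages point the user here for later
// inspection.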
func vmRunToolingHarnessLogPath(repoName string) string {
	return filepath.ToSlash(filepath.Join("/root/.cache/banger", "vm-run-tooling-"+repoName+".log"))
}

// startVMRunToolingHarness uploads + launches the mise bootstrap
// script inside the guest. repoRoot / repoName both come from the
// daemon's workspace.prepare RPC response so the CLI doesn't have
// to re-inspect the git tree.
//
// When wait is true (used by --detach), the harness runs in the
// foreground so the CLI can return only after bootstrap finishes;
// the harness's stdout is streamed to syncOut for live visibility.
// When wait is false (interactive mode), the harness is nohup'd so
// the user's ssh session can start while bootstrap continues.
func (d *deps) startVMRunToolingHarness(ctx context.Context, client vmRunGuestClient, repoRoot, repoName string, progress *vmRunProgressRenderer, wait bool, syncOut io.Writer) error {
	if progress != nil {
		progress.render("starting guest tooling bootstrap")
	}
	plan := d.buildVMRunToolingPlan(ctx, repoRoot)
	var uploadLog bytes.Buffer
	if err := client.UploadFile(ctx, vmRunToolingHarnessPath(repoName), 0o755, []byte(vmRunToolingHarnessScript(plan)), &uploadLog); err != nil {
		return formatVMRunStepError("upload guest tooling bootstrap", err, uploadLog.String())
	}
	if wait {
		var launchLog bytes.Buffer
		out := io.Writer(&launchLog)
		if syncOut != nil {
			out = io.MultiWriter(syncOut, &launchLog)
		}
		if err := client.RunScript(ctx, vmRunToolingHarnessSyncScript(repoName), out); err != nil {
			return formatVMRunStepError("run guest tooling bootstrap", err, launchLog.String())
		}
		if progress != nil {
			progress.render("guest tooling bootstrap done (log: " + vmRunToolingHarnessLogPath(repoName) + ")")
		}
		return nil
	}
	var launchLog bytes.Buffer
	if err := client.RunScript(ctx, vmRunToolingHarnessLaunchScript(repoName), &launchLog); err != nil {
		return formatVMRunStepError("launch guest tooling bootstrap", err, launchLog.String())
	}
	if progress != nil {
		progress.render("guest tooling log: " + vmRunToolingHarnessLogPath(repoName))
	}
	return nil
}

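// vmRunToolingHarnessScript renders the best-effort bootstrap shell
// script from the tooling plan. The script deliberately runs with
// `set -uo pipefail` but NOT -e, and wraps every install step in a
// best-effort helper: tooling bootstrap must never fail the vm run
// itself, only log what went wrong.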
func vmRunToolingHarnessScript(plan toolingplan.Plan) string {
	var script strings.Builder
	script.WriteString("set -uo pipefail\n")
	fmt.Fprintf(&script, "DIR=%s\n", shellQuote(vmRunGuestDir()))
	script.WriteString("export PATH=/usr/local/bin:/root/.local/share/mise/shims:$PATH\n")
	script.WriteString("if [ -f /etc/profile.d/mise.sh ]; then . /etc/profile.d/mise.sh || true; fi\n")
	script.WriteString("log() { printf '%s\\n' \"$*\"; }\n")
	script.WriteString("run_best_effort() {\n")
	script.WriteString("  \"$@\"\n")
	script.WriteString("  rc=$?\n")
	script.WriteString("  if [ \"$rc\" -ne 0 ]; then\n")
	script.WriteString("    log \"command failed ($rc): $*\"\n")
	script.WriteString("  fi\n")
	script.WriteString("  return 0\n")
	script.WriteString("}\n")
script.WriteString("run_bounded_best_effort() {\n")
script.WriteString(" timeout_secs=\"$1\"\n")
script.WriteString(" shift\n")
script.WriteString(" timeout_marker=\"$(mktemp)\"\n")
script.WriteString(" rm -f \"$timeout_marker\"\n")
script.WriteString(" \"$@\" &\n")
script.WriteString(" cmd_pid=$!\n")
script.WriteString(" (\n")
script.WriteString(" sleep \"$timeout_secs\"\n")
script.WriteString(" if kill -0 \"$cmd_pid\" 2>/dev/null; then\n")
script.WriteString(" : >\"$timeout_marker\"\n")
script.WriteString(" log \"command timed out after ${timeout_secs}s: $*\"\n")
script.WriteString(" kill -TERM \"$cmd_pid\" 2>/dev/null || true\n")
script.WriteString(" if command -v pkill >/dev/null 2>&1; then pkill -TERM -P \"$cmd_pid\" 2>/dev/null || true; fi\n")
script.WriteString(" sleep 2\n")
script.WriteString(" kill -KILL \"$cmd_pid\" 2>/dev/null || true\n")
script.WriteString(" if command -v pkill >/dev/null 2>&1; then pkill -KILL -P \"$cmd_pid\" 2>/dev/null || true; fi\n")
script.WriteString(" fi\n")
script.WriteString(" ) &\n")
script.WriteString(" watchdog_pid=$!\n")
script.WriteString(" wait \"$cmd_pid\"\n")
script.WriteString(" rc=$?\n")
script.WriteString(" kill \"$watchdog_pid\" 2>/dev/null || true\n")
script.WriteString(" wait \"$watchdog_pid\" 2>/dev/null || true\n")
script.WriteString(" if [ -f \"$timeout_marker\" ]; then\n")
script.WriteString(" rm -f \"$timeout_marker\"\n")
script.WriteString(" return 0\n")
script.WriteString(" fi\n")
script.WriteString(" rm -f \"$timeout_marker\"\n")
script.WriteString(" if [ \"$rc\" -ne 0 ]; then\n")
script.WriteString(" log \"command failed ($rc): $*\"\n")
script.WriteString(" fi\n")
script.WriteString(" return 0\n")
script.WriteString("}\n")
script.WriteString("cd \"$DIR\" || { log \"missing repo directory: $DIR\"; exit 0; }\n")
script.WriteString("MISE_BIN=\"$(command -v mise || true)\"\n")
script.WriteString("if [ -z \"$MISE_BIN\" ]; then log \"mise not found; skipping guest tooling bootstrap\"; exit 0; fi\n")
script.WriteString("log \"starting guest tooling bootstrap in $DIR\"\n")
if len(plan.RepoManagedTools) > 0 {
fmt.Fprintf(&script, "log %s\n", shellQuote("repo-managed mise tools: "+strings.Join(plan.RepoManagedTools, ", ")))
}
script.WriteString("if [ -f .mise.toml ] || [ -f .tool-versions ]; then\n")
script.WriteString(" log \"running mise install from repo declarations\"\n")
script.WriteString(" run_best_effort \"$MISE_BIN\" install\n")
script.WriteString("fi\n")
fmt.Fprintf(&script, "INSTALL_TIMEOUT_SECS=%d\n", vmRunToolingInstallTimeoutSeconds)
for _, step := range plan.Steps {
stepLabel := fmt.Sprintf("deterministic install: %s@%s (%s)", step.Tool, step.Version, step.Source)
fmt.Fprintf(&script, "log %s\n", shellQuote(stepLabel))
fmt.Fprintf(&script, "run_bounded_best_effort \"$INSTALL_TIMEOUT_SECS\" \"$MISE_BIN\" use -g --pin %s\n", shellQuote(step.Tool+"@"+step.Version))
}
for _, skip := range plan.Skips {
skipLabel := fmt.Sprintf("deterministic skip: %s (%s)", skip.Target, skip.Reason)
fmt.Fprintf(&script, "log %s\n", shellQuote(skipLabel))
}
if len(plan.Steps) > 0 {
script.WriteString("run_best_effort \"$MISE_BIN\" reshim\n")
}
script.WriteString("log \"guest tooling bootstrap finished\"\n")
return script.String()
}
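// vmRunToolingHarnessLaunchScript nohup's the uploaded harness into
// the background and detaches it from the ssh session, so interactive
// use can attach immediately while bootstrap keeps running.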
func vmRunToolingHarnessLaunchScript(repoName string) string {
	var script strings.Builder
	script.WriteString("set -euo pipefail\n")
	fmt.Fprintf(&script, "HELPER=%s\n", shellQuote(vmRunToolingHarnessPath(repoName)))
	fmt.Fprintf(&script, "LOG=%s\n", shellQuote(vmRunToolingHarnessLogPath(repoName)))
	script.WriteString("mkdir -p \"$(dirname \"$LOG\")\"\n")
	script.WriteString("nohup bash \"$HELPER\" >\"$LOG\" 2>&1 </dev/null &\n")
	script.WriteString("disown || true\n")
	return script.String()
}

// vmRunToolingHarnessSyncScript is the foreground variant used by
// --detach: it tees the harness output to both the log file and the
// caller's stdout so the host-side CLI can stream live progress while
// still preserving the log for later inspection.
func vmRunToolingHarnessSyncScript(repoName string) string {
	var script strings.Builder
	script.WriteString("set -uo pipefail\n")
	fmt.Fprintf(&script, "HELPER=%s\n", shellQuote(vmRunToolingHarnessPath(repoName)))
	fmt.Fprintf(&script, "LOG=%s\n", shellQuote(vmRunToolingHarnessLogPath(repoName)))
	script.WriteString("mkdir -p \"$(dirname \"$LOG\")\"\n")
	script.WriteString("bash \"$HELPER\" 2>&1 | tee \"$LOG\"\n")
	return script.String()
}

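// formatVMRunStepError wraps a failed step's error, appending the
// captured guest log output (trimmed) when there is any, so the
// guest's own output isn't lost.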
func formatVMRunStepError(action string, err error, log string) error {
	log = strings.TrimSpace(log)
	if log == "" {
		return fmt.Errorf("%s: %w", action, err)
	}
	return fmt.Errorf("%s: %w: %s", action, err, log)
}

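// vmRunProgressRenderer draws `vm run` phase updates, either as a
// single in-place status line (inline mode) or as plain appended
// lines (tests, pipes, verbose mode).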
type vmRunProgressRenderer struct {
	out      io.Writer
	enabled  bool
	inline   bool
	active   bool
	lastLine string
}

// newVMRunProgressRenderer wires up progress for `vm run`. Unlike the
// vm_create renderer, this one emits in line mode even on non-TTY
// writers (covers tests and piped output that the existing tooling
// already parses); inline mode kicks in only when stderr is a TTY,
// verbose is unset, and BANGER_NO_PROGRESS is unset.
func newVMRunProgressRenderer(out io.Writer, verbose bool) *vmRunProgressRenderer {
	if out == nil {
		return &vmRunProgressRenderer{}
	}
	return &vmRunProgressRenderer{
		out:     out,
		enabled: true,
		inline:  writerSupportsProgress(out) && !verbose && !progressDisabledByEnv(),
	}
}

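// render shows detail as the current status. Inline mode redraws the
// live line in place; line mode appends a new line. Consecutive
// duplicates are dropped in both modes.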
func (r *vmRunProgressRenderer) render(detail string) {
	if r == nil || !r.enabled {
		return
	}
	line := formatVMRunProgress(detail)
	if line == "" || line == r.lastLine {
		return
	}
	r.lastLine = line
	if r.inline {
		_, _ = fmt.Fprint(r.out, "\r\x1b[K", line)
		r.active = true
		return
	}
	_, _ = fmt.Fprintln(r.out, line)
}

// clear erases the live inline line so the caller can write a clean
// terminating message (warning, ssh attach, command output). No-op
// outside inline mode.
func (r *vmRunProgressRenderer) clear() {
	if r == nil || !r.enabled || !r.inline || !r.active {
		return
	}
	_, _ = fmt.Fprint(r.out, "\r\x1b[K")
	r.active = false
	r.lastLine = ""
}

// commitLine prints detail as a final, persistent line. In inline
// mode it overwrites the live status; in line mode it just appends.
// Used for terminal messages like the --detach hand-off summary.
func (r *vmRunProgressRenderer) commitLine(detail string) {
	if r == nil || !r.enabled {
		return
	}
	line := formatVMRunProgress(detail)
	if line == "" {
		return
	}
	if r.inline {
		_, _ = fmt.Fprint(r.out, "\r\x1b[K", line, "\n")
		r.active = false
		r.lastLine = ""
		return
	}
	if line == r.lastLine {
		return
	}
	r.lastLine = line
	_, _ = fmt.Fprintln(r.out, line)
}

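// formatVMRunProgress prefixes a trimmed detail with "[vm run] ";
// empty details collapse to the empty string so callers can skip them.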
func formatVMRunProgress(detail string) string {
	detail = strings.TrimSpace(detail)
	if detail == "" {
		return ""
	}
	return "[vm run] " + detail
}

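// printVMRunWarning writes a single non-fatal "[vm run] warning: ..."
// line, ignoring nil writers and blank details.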
func printVMRunWarning(out io.Writer, detail string) {
	detail = strings.TrimSpace(detail)
	if out == nil || detail == "" {
		return
	}
	_, _ = fmt.Fprintln(out, "[vm run] warning: "+detail)
}