banger/internal/cli/vm_run.go

package cli

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"net"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"time"

	"banger/internal/api"
	"banger/internal/daemon/workspace"
	"banger/internal/model"
	"banger/internal/toolingplan"

	"github.com/spf13/cobra"
)

// vmRunGuestClient is the narrow guest-SSH surface vm run needs. The
// daemon's guest-SSH package returns a value that satisfies this
// interface directly; we restate it here so tests can plug in fakes
// without pulling the full daemon in.
type vmRunGuestClient interface {
	Close() error
	UploadFile(ctx context.Context, remotePath string, mode os.FileMode, data []byte, logWriter io.Writer) error
	RunScript(ctx context.Context, script string, logWriter io.Writer) error
	StreamTar(ctx context.Context, sourceDir, remoteCommand string, logWriter io.Writer) error
	StreamTarEntries(ctx context.Context, sourceDir string, entries []string, remoteCommand string, logWriter io.Writer) error
}

// vmRunRepo is the CLI-local view of the workspace argument to
// `vm run`: an absolute source path that passed preflight, plus the
// two branch flags. Everything else the flow needs (RepoRoot,
// RepoName, HEAD commit, etc.) comes back from the workspace.prepare
// RPC, which does the full git inspection daemon-side.
type vmRunRepo struct {
	sourcePath       string
	branchName       string
	fromRef          string
	includeUntracked bool
}

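// vmRunToolingInstallTimeoutSeconds bounds each deterministic tool
// install step in the guest bootstrap script (enforced by
// run_bounded_best_effort) so a single hung installer cannot stall
// the whole harness.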
const vmRunToolingInstallTimeoutSeconds = 120

// vmRunSSHTimeout bounds how long `vm run` waits for guest ssh after
// the vsock agent is ready. vsock readiness already means systemd
// should be up within seconds; a minute plus change is generous
// headroom for a slow first boot while still short enough that a
// wedged sshd surfaces promptly instead of hanging forever. Var, not
// const, so tests can shrink it.
var vmRunSSHTimeout = 90 * time.Second

// ExitCodeError wraps a remote command's exit status so the CLI's main()
// can propagate it verbatim. Only errors explicitly wrapped in this
// type get forwarded as process exit codes — plain *exec.ExitError
// values (from unrelated subprocesses like mkfs.ext4) must still
// surface as regular errors so the user sees a message.
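//
// A hypothetical caller in main() might unwrap it along these lines:
//
//	var ec ExitCodeError
//	if errors.As(err, &ec) {
//		os.Exit(ec.Code)
//	}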
type ExitCodeError struct {
	Code int
}

func (e ExitCodeError) Error() string {
	return fmt.Sprintf("exit status %d", e.Code)
}

// vmRunPreflightRepo validates a vm run workspace path BEFORE the VM
// is created, so bad paths fail fast instead of leaving the user
// with an orphaned VM. The check is intentionally minimal: the
// daemon's PrepareVMWorkspace does a full git inspection (branch,
// HEAD, identity, overlay) and returns everything the tooling
// harness needs, so duplicating the heavy lifting here just doubles
// the I/O. We only enforce what the user can fix locally before
// banger commits to creating a VM:
//
//
//   - the path exists and is a directory,
//   - it sits inside a non-bare git repository,
//   - the repository has no submodules (unsupported in the shallow
//     overlay mode vm run uses).
func (d *deps) vmRunPreflightRepo(ctx context.Context, rawPath string) (string, error) {
	if strings.TrimSpace(rawPath) == "" {
		wd, err := d.cwd()
		if err != nil {
			return "", err
		}
		rawPath = wd
	}
	sourcePath, err := workspace.ResolveSourcePath(rawPath)
	if err != nil {
		return "", err
	}
	repoRoot, err := d.repoInspector.GitTrimmedOutput(ctx, sourcePath, "rev-parse", "--show-toplevel")
	if err != nil {
		return "", fmt.Errorf("%s is not inside a git repository", sourcePath)
	}
	isBare, err := d.repoInspector.GitTrimmedOutput(ctx, repoRoot, "rev-parse", "--is-bare-repository")
	if err != nil {
		return "", fmt.Errorf("inspect git repository %s: %w", repoRoot, err)
	}
	if isBare == "true" {
		return "", fmt.Errorf("vm run requires a non-bare git repository: %s", repoRoot)
	}
	submodules, err := d.repoInspector.ListSubmodules(ctx, repoRoot)
	if err != nil {
		return "", err
	}
	if len(submodules) > 0 {
		return "", fmt.Errorf("vm run does not support git submodules in %s (%s); use `vm create` + `vm workspace prepare --mode full_copy`", repoRoot, strings.Join(submodules, ", "))
	}
	return sourcePath, nil
}

// repoHasMiseFiles reports whether the repo at sourcePath contains a
// mise tooling manifest. Used as a host-side preflight: when --nat is
// off and a manifest is present, vm run refuses early instead of
// committing to a VM that will silently fail to install tools.
func repoHasMiseFiles(sourcePath string) (bool, error) {
	for _, name := range []string{".mise.toml", ".tool-versions"} {
		info, err := os.Stat(filepath.Join(sourcePath, name))
		if err == nil && !info.IsDir() {
			return true, nil
		}
		if err != nil && !errors.Is(err, os.ErrNotExist) {
			return false, fmt.Errorf("inspect %s: %w", name, err)
		}
	}
	return false, nil
}

// splitVMRunArgs partitions cobra positional args into the optional path
// argument and the trailing command (everything after a `--` separator).
// The path slice may contain 0..1 entries; the command slice may be empty.
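//
//	// e.g. `banger vm run ./repo -- make test` arrives as
//	// args = ["./repo", "make", "test"] with ArgsLenAtDash() == 1,
//	// yielding pathArgs = ["./repo"], commandArgs = ["make", "test"].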
func splitVMRunArgs(cmd *cobra.Command, args []string) (pathArgs, commandArgs []string) {
	dash := cmd.ArgsLenAtDash()
	if dash < 0 {
		return args, nil
	}
	if dash > len(args) {
		dash = len(args)
	}
	return args[:dash], args[dash:]
}

// runVMRun orchestrates the full `vm run` flow: create the VM, wait
// for guest ssh, optionally materialise a workspace and kick off the
// tooling bootstrap, then either attach interactively or run the
// user's command and propagate its exit status.
func (d *deps) runVMRun(ctx context.Context, socketPath string, cfg model.DaemonConfig, stdin io.Reader, stdout, stderr io.Writer, params api.VMCreateParams, repo *vmRunRepo, command []string, removeOnExit, detach, skipBootstrap, verbose bool) error {
	if repo != nil && !skipBootstrap && !params.NATEnabled {
		hasMise, err := repoHasMiseFiles(repo.sourcePath)
		if err != nil {
			return err
		}
		if hasMise {
			return errors.New("tooling bootstrap requires --nat (or pass --no-bootstrap to skip)")
		}
	}
	progress := newVMRunProgressRenderer(stderr, verbose)
	defer progress.clear()
	vm, err := d.runVMCreate(ctx, socketPath, stderr, params, verbose)
	if err != nil {
		return err
	}
	vmRef := strings.TrimSpace(vm.Name)
	if vmRef == "" {
		vmRef = shortID(vm.ID)
	}
	// --rm cleanup is wired AFTER ssh is confirmed. An ssh-wait
	// timeout leaves the VM alive for `vm logs` inspection (our
	// error message tells the user that); the cleanup only fires
	// once the session phase runs.
	shouldRemove := false
	if removeOnExit {
		defer func() {
			if !shouldRemove {
				return
			}
			// Use a fresh context so Ctrl-C during the session
			// doesn't abort the delete RPC.
			cleanupCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
			defer cancel()
			if err := d.vmDelete(cleanupCtx, socketPath, vmRef); err != nil {
				progress.clear()
				printVMRunWarning(stderr, fmt.Sprintf("--rm cleanup failed: %v (leaked vm %q; delete manually)", err, vmRef))
			} else if err := removeUserKnownHosts(vm); err != nil {
				progress.clear()
				printVMRunWarning(stderr, fmt.Sprintf("known_hosts cleanup failed: %v", err))
			}
		}()
	}
	sshAddress := net.JoinHostPort(vm.Runtime.GuestIP, "22")
	progress.render("waiting for guest ssh")
	sshCtx, cancelSSH := context.WithTimeout(ctx, vmRunSSHTimeout)
	if err := d.guestWaitForSSH(sshCtx, sshAddress, cfg.SSHKeyPath, 250*time.Millisecond); err != nil {
		cancelSSH()
		// Surface parent-context cancellation (Ctrl-C, caller
		// timeout) as-is. Only the guest-side timeout needs the
		// actionable hint.
		if errors.Is(ctx.Err(), context.Canceled) || errors.Is(ctx.Err(), context.DeadlineExceeded) {
			return fmt.Errorf("vm %q: %w", vmRef, ctx.Err())
		}
		return fmt.Errorf(
			"vm %q is running but guest ssh did not come up within %s. "+
				"sshd is the likely suspect — inspect the guest console with "+
				"`banger vm logs %s` (look for `Failed to start ssh.service`). "+
				"The VM is still alive; leave it for inspection or remove with `banger vm delete %s`. "+
				"underlying error: %w",
			vmRef, vmRunSSHTimeout, vmRef, vmRef, err,
		)
	}
	cancelSSH()
	shouldRemove = removeOnExit
	if repo != nil {
		progress.render("preparing guest workspace")
		// --from is only meaningful paired with --branch; the daemon
		// rejects "from without branch" outright. Our flag default is
		// "HEAD" (useful only when --branch is set), so scrub it when
		// branch is empty to avoid a false "workspace from requires
		// branch" error.
		fromRef := ""
		if strings.TrimSpace(repo.branchName) != "" {
			fromRef = repo.fromRef
		}
		if !repo.includeUntracked {
			progress.clear()
			d.noteUntrackedSkipped(ctx, stderr, repo.sourcePath)
		}
		prepared, err := d.vmWorkspacePrepare(ctx, socketPath, api.VMWorkspacePrepareParams{
			IDOrName:         vmRef,
			SourcePath:       repo.sourcePath,
			GuestPath:        vmRunGuestDir(),
			Branch:           repo.branchName,
			From:             fromRef,
			Mode:             string(model.WorkspacePrepareModeShallowOverlay),
			IncludeUntracked: repo.includeUntracked,
		})
		if err != nil {
			return fmt.Errorf("vm %q is running but workspace prepare failed: %w", vmRef, err)
		}
		// The prepare RPC already did the full git inspection on the
		// daemon side; grab what the tooling harness needs from its
		// result instead of re-inspecting here.
		if len(command) == 0 && !skipBootstrap {
			client, err := d.guestDial(ctx, sshAddress, cfg.SSHKeyPath)
			if err != nil {
				return fmt.Errorf("vm %q is running but guest ssh is unavailable: %w", vmRef, err)
			}
			if err := d.startVMRunToolingHarness(ctx, client, prepared.Workspace.RepoRoot, prepared.Workspace.RepoName, progress, detach, stderr); err != nil {
				progress.clear()
				printVMRunWarning(stderr, fmt.Sprintf("guest tooling bootstrap start failed: %v", err))
			}
			_ = client.Close()
		}
	}
	if detach {
		progress.commitLine(fmt.Sprintf("vm %s running; reconnect with: banger vm ssh %s", vmRef, vmRef))
		return nil
	}
	sshArgs, err := sshCommandArgs(cfg, vm.Runtime.GuestIP, command)
	if err != nil {
		return fmt.Errorf("vm %q is running but ssh args could not be built: %w", vmRef, err)
	}
	if len(command) > 0 {
		progress.render("running command in guest")
		progress.clear()
		if err := d.sshExec(ctx, stdin, stdout, stderr, sshArgs); err != nil {
			var exitErr *exec.ExitError
			if errors.As(err, &exitErr) {
				return ExitCodeError{Code: exitErr.ExitCode()}
			}
			return err
		}
		return nil
	}
	progress.render("attaching to guest")
	progress.clear()
	return d.runSSHSession(ctx, socketPath, vmRef, stdin, stdout, stderr, sshArgs, removeOnExit)
}

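// vmRunGuestDir is the fixed guest-side path the workspace is
// materialised into; the tooling harness cd's here before installing.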
func vmRunGuestDir() string {
	return "/root/repo"
}

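// vmRunToolingHarnessPath is the guest-side upload location for the
// bootstrap script, namespaced by repo name so different repos get
// distinct scripts.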
func vmRunToolingHarnessPath(repoName string) string {
	return filepath.ToSlash(filepath.Join("/tmp", "banger-vm-run-tooling-"+repoName+".sh"))
}

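// vmRunToolingHarnessLogPath is the guest-side log file the harness
// writes to; progress messages point the user here for later
// inspection.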
func vmRunToolingHarnessLogPath(repoName string) string {
	return filepath.ToSlash(filepath.Join("/root/.cache/banger", "vm-run-tooling-"+repoName+".log"))
}

// startVMRunToolingHarness uploads + launches the mise bootstrap
// script inside the guest. repoRoot / repoName both come from the
// daemon's workspace.prepare RPC response so the CLI doesn't have
// to re-inspect the git tree.
//
// When wait is true (used by --detach), the harness runs in the
// foreground so the CLI can return only after bootstrap finishes;
// the harness's stdout is streamed to syncOut for live visibility.
// When wait is false (interactive mode), the harness is nohup'd so
// the user's ssh session can start while bootstrap continues.
func (d *deps) startVMRunToolingHarness(ctx context.Context, client vmRunGuestClient, repoRoot, repoName string, progress *vmRunProgressRenderer, wait bool, syncOut io.Writer) error {
	if progress != nil {
		progress.render("starting guest tooling bootstrap")
	}
	plan := d.buildVMRunToolingPlan(ctx, repoRoot)
	var uploadLog bytes.Buffer
	if err := client.UploadFile(ctx, vmRunToolingHarnessPath(repoName), 0o755, []byte(vmRunToolingHarnessScript(plan)), &uploadLog); err != nil {
		return formatVMRunStepError("upload guest tooling bootstrap", err, uploadLog.String())
	}
	if wait {
		var launchLog bytes.Buffer
		out := io.Writer(&launchLog)
		if syncOut != nil {
			out = io.MultiWriter(syncOut, &launchLog)
		}
		if err := client.RunScript(ctx, vmRunToolingHarnessSyncScript(repoName), out); err != nil {
			return formatVMRunStepError("run guest tooling bootstrap", err, launchLog.String())
		}
		if progress != nil {
			progress.render("guest tooling bootstrap done (log: " + vmRunToolingHarnessLogPath(repoName) + ")")
		}
		return nil
	}
	var launchLog bytes.Buffer
	if err := client.RunScript(ctx, vmRunToolingHarnessLaunchScript(repoName), &launchLog); err != nil {
		return formatVMRunStepError("launch guest tooling bootstrap", err, launchLog.String())
	}
	if progress != nil {
		progress.render("guest tooling log: " + vmRunToolingHarnessLogPath(repoName))
	}
	return nil
}

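// vmRunToolingHarnessScript renders the best-effort bootstrap shell
// script from the tooling plan. The script deliberately runs with
// `set -uo pipefail` but NOT -e, and wraps every install step in a
// best-effort helper: tooling bootstrap must never fail the vm run
// itself, only log what went wrong.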
func vmRunToolingHarnessScript(plan toolingplan.Plan) string {
	var script strings.Builder
	script.WriteString("set -uo pipefail\n")
	fmt.Fprintf(&script, "DIR=%s\n", shellQuote(vmRunGuestDir()))
	script.WriteString("export PATH=/usr/local/bin:/root/.local/share/mise/shims:$PATH\n")
	script.WriteString("if [ -f /etc/profile.d/mise.sh ]; then . /etc/profile.d/mise.sh || true; fi\n")
	script.WriteString("log() { printf '%s\\n' \"$*\"; }\n")
	script.WriteString("run_best_effort() {\n")
	script.WriteString("  \"$@\"\n")
	script.WriteString("  rc=$?\n")
	script.WriteString("  if [ \"$rc\" -ne 0 ]; then\n")
	script.WriteString("    log \"command failed ($rc): $*\"\n")
	script.WriteString("  fi\n")
	script.WriteString("  return 0\n")
	script.WriteString("}\n")
script.WriteString("run_bounded_best_effort() {\n")
script.WriteString(" timeout_secs=\"$1\"\n")
script.WriteString(" shift\n")
script.WriteString(" timeout_marker=\"$(mktemp)\"\n")
script.WriteString(" rm -f \"$timeout_marker\"\n")
script.WriteString(" \"$@\" &\n")
script.WriteString(" cmd_pid=$!\n")
script.WriteString(" (\n")
script.WriteString(" sleep \"$timeout_secs\"\n")
script.WriteString(" if kill -0 \"$cmd_pid\" 2>/dev/null; then\n")
script.WriteString(" : >\"$timeout_marker\"\n")
script.WriteString(" log \"command timed out after ${timeout_secs}s: $*\"\n")
script.WriteString(" kill -TERM \"$cmd_pid\" 2>/dev/null || true\n")
script.WriteString(" if command -v pkill >/dev/null 2>&1; then pkill -TERM -P \"$cmd_pid\" 2>/dev/null || true; fi\n")
script.WriteString(" sleep 2\n")
script.WriteString(" kill -KILL \"$cmd_pid\" 2>/dev/null || true\n")
script.WriteString(" if command -v pkill >/dev/null 2>&1; then pkill -KILL -P \"$cmd_pid\" 2>/dev/null || true; fi\n")
script.WriteString(" fi\n")
script.WriteString(" ) &\n")
script.WriteString(" watchdog_pid=$!\n")
script.WriteString(" wait \"$cmd_pid\"\n")
script.WriteString(" rc=$?\n")
script.WriteString(" kill \"$watchdog_pid\" 2>/dev/null || true\n")
script.WriteString(" wait \"$watchdog_pid\" 2>/dev/null || true\n")
script.WriteString(" if [ -f \"$timeout_marker\" ]; then\n")
script.WriteString(" rm -f \"$timeout_marker\"\n")
script.WriteString(" return 0\n")
script.WriteString(" fi\n")
script.WriteString(" rm -f \"$timeout_marker\"\n")
script.WriteString(" if [ \"$rc\" -ne 0 ]; then\n")
script.WriteString(" log \"command failed ($rc): $*\"\n")
script.WriteString(" fi\n")
script.WriteString(" return 0\n")
script.WriteString("}\n")
script.WriteString("cd \"$DIR\" || { log \"missing repo directory: $DIR\"; exit 0; }\n")
script.WriteString("MISE_BIN=\"$(command -v mise || true)\"\n")
script.WriteString("if [ -z \"$MISE_BIN\" ]; then log \"mise not found; skipping guest tooling bootstrap\"; exit 0; fi\n")
script.WriteString("log \"starting guest tooling bootstrap in $DIR\"\n")
if len(plan.RepoManagedTools) > 0 {
fmt.Fprintf(&script, "log %s\n", shellQuote("repo-managed mise tools: "+strings.Join(plan.RepoManagedTools, ", ")))
}
script.WriteString("if [ -f .mise.toml ] || [ -f .tool-versions ]; then\n")
script.WriteString(" log \"running mise install from repo declarations\"\n")
script.WriteString(" run_best_effort \"$MISE_BIN\" install\n")
script.WriteString("fi\n")
fmt.Fprintf(&script, "INSTALL_TIMEOUT_SECS=%d\n", vmRunToolingInstallTimeoutSeconds)
for _, step := range plan.Steps {
stepLabel := fmt.Sprintf("deterministic install: %s@%s (%s)", step.Tool, step.Version, step.Source)
fmt.Fprintf(&script, "log %s\n", shellQuote(stepLabel))
fmt.Fprintf(&script, "run_bounded_best_effort \"$INSTALL_TIMEOUT_SECS\" \"$MISE_BIN\" use -g --pin %s\n", shellQuote(step.Tool+"@"+step.Version))
}
for _, skip := range plan.Skips {
skipLabel := fmt.Sprintf("deterministic skip: %s (%s)", skip.Target, skip.Reason)
fmt.Fprintf(&script, "log %s\n", shellQuote(skipLabel))
}
if len(plan.Steps) > 0 {
script.WriteString("run_best_effort \"$MISE_BIN\" reshim\n")
}
script.WriteString("log \"guest tooling bootstrap finished\"\n")
return script.String()
}
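// vmRunToolingHarnessLaunchScript nohup's the uploaded harness into
// the background and detaches it from the ssh session, so interactive
// use can attach immediately while bootstrap keeps running.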
func vmRunToolingHarnessLaunchScript(repoName string) string {
	var script strings.Builder
	script.WriteString("set -euo pipefail\n")
	fmt.Fprintf(&script, "HELPER=%s\n", shellQuote(vmRunToolingHarnessPath(repoName)))
	fmt.Fprintf(&script, "LOG=%s\n", shellQuote(vmRunToolingHarnessLogPath(repoName)))
	script.WriteString("mkdir -p \"$(dirname \"$LOG\")\"\n")
	script.WriteString("nohup bash \"$HELPER\" >\"$LOG\" 2>&1 </dev/null &\n")
	script.WriteString("disown || true\n")
	return script.String()
}

// vmRunToolingHarnessSyncScript is the foreground variant used by
// --detach: it tees the harness output to both the log file and the
// caller's stdout so the host-side CLI can stream live progress while
// still preserving the log for later inspection.
func vmRunToolingHarnessSyncScript(repoName string) string {
	var script strings.Builder
	script.WriteString("set -uo pipefail\n")
	fmt.Fprintf(&script, "HELPER=%s\n", shellQuote(vmRunToolingHarnessPath(repoName)))
	fmt.Fprintf(&script, "LOG=%s\n", shellQuote(vmRunToolingHarnessLogPath(repoName)))
	script.WriteString("mkdir -p \"$(dirname \"$LOG\")\"\n")
	script.WriteString("bash \"$HELPER\" 2>&1 | tee \"$LOG\"\n")
	return script.String()
}

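// formatVMRunStepError wraps a failed step's error, appending the
// captured guest log output (trimmed) when there is any, so the
// guest's own output isn't lost.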
func formatVMRunStepError(action string, err error, log string) error {
	log = strings.TrimSpace(log)
	if log == "" {
		return fmt.Errorf("%s: %w", action, err)
	}
	return fmt.Errorf("%s: %w: %s", action, err, log)
}

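// vmRunProgressRenderer draws `vm run` phase updates, either as a
// single in-place status line (inline mode) or as plain appended
// lines (tests, pipes, verbose mode).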
type vmRunProgressRenderer struct {
	out      io.Writer
	enabled  bool
	inline   bool
	active   bool
	lastLine string
}

// newVMRunProgressRenderer wires up progress for `vm run`. Unlike the
// vm_create renderer, this one emits in line mode even on non-TTY
// writers (covers tests and piped output that the existing tooling
// already parses); inline mode kicks in only when stderr is a TTY,
// verbose is unset, and BANGER_NO_PROGRESS is unset.
func newVMRunProgressRenderer(out io.Writer, verbose bool) *vmRunProgressRenderer {
	if out == nil {
		return &vmRunProgressRenderer{}
	}
	return &vmRunProgressRenderer{
		out:     out,
		enabled: true,
		inline:  writerSupportsProgress(out) && !verbose && !progressDisabledByEnv(),
	}
}

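// render shows detail as the current status. Inline mode redraws the
// live line in place; line mode appends a new line. Consecutive
// duplicates are dropped in both modes.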
func (r *vmRunProgressRenderer) render(detail string) {
	if r == nil || !r.enabled {
		return
	}
	line := formatVMRunProgress(detail)
	if line == "" || line == r.lastLine {
		return
	}
	r.lastLine = line
	if r.inline {
		_, _ = fmt.Fprint(r.out, "\r\x1b[K", line)
		r.active = true
		return
	}
	_, _ = fmt.Fprintln(r.out, line)
}

// clear erases the live inline line so the caller can write a clean
// terminating message (warning, ssh attach, command output). No-op
// outside inline mode.
func (r *vmRunProgressRenderer) clear() {
	if r == nil || !r.enabled || !r.inline || !r.active {
		return
	}
	_, _ = fmt.Fprint(r.out, "\r\x1b[K")
	r.active = false
	r.lastLine = ""
}

// commitLine prints detail as a final, persistent line. In inline
// mode it overwrites the live status; in line mode it just appends.
// Used for terminal messages like the --detach hand-off summary.
func (r *vmRunProgressRenderer) commitLine(detail string) {
	if r == nil || !r.enabled {
		return
	}
	line := formatVMRunProgress(detail)
	if line == "" {
		return
	}
	if r.inline {
		_, _ = fmt.Fprint(r.out, "\r\x1b[K", line, "\n")
		r.active = false
		r.lastLine = ""
		return
	}
	if line == r.lastLine {
		return
	}
	r.lastLine = line
	_, _ = fmt.Fprintln(r.out, line)
}

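// formatVMRunProgress prefixes a trimmed detail with "[vm run] ";
// empty details collapse to the empty string so callers can skip them.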
func formatVMRunProgress(detail string) string {
	detail = strings.TrimSpace(detail)
	if detail == "" {
		return ""
	}
	return "[vm run] " + detail
}

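// printVMRunWarning writes a single non-fatal "[vm run] warning: ..."
// line, ignoring nil writers and blank details.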
func printVMRunWarning(out io.Writer, detail string) {
	detail = strings.TrimSpace(detail)
	if out == nil || detail == "" {
		return
	}
	_, _ = fmt.Fprintln(out, "[vm run] warning: "+detail)
}