banger/internal/daemon/workspace.go
Thales Maciel 687fcf0b59
vm state: split transient kernel/process handles off the durable schema
Separates what a VM IS (durable intent + identity + deterministic
derived paths — `VMRuntime`) from what is CURRENTLY TRUE about it
(firecracker PID, tap device, loop devices, dm-snapshot target — new
`VMHandles`). The durable state lives in the SQLite `vms` row; the
transient state lives in an in-memory cache on the daemon plus a
per-VM `handles.json` scratch file inside VMDir, rebuilt at startup
from OS inspection. Nothing kernel-level rides the SQLite schema
anymore.

Why:

  Persisting ephemeral process handles to SQLite forced reconcile to
  treat "running with a stale PID" as a first-class case and mix it
  with real state transitions. The schema described what we last
  observed, not what the VM is. Every time the observation model
  shifted (tap pool, DM naming, pgrep fallback) the reconcile logic
  grew a new branch. Splitting lets each layer own what it's good at:
  durable records describe intent, in-memory cache + scratch file
  describe momentary reality.

Shape:

  - `model.VMHandles` = PID, TapDevice, BaseLoop, COWLoop, DMName,
    DMDev. Never in SQLite.
  - `VMRuntime` keeps: State, GuestIP, APISockPath, VSockPath,
    VSockCID, LogPath, MetricsPath, DNSName, VMDir, SystemOverlay,
    WorkDiskPath, LastError. All durable or deterministic.
  - `handleCache` on `*Daemon` — mutex-guarded map + scratch-file
    plumbing (`writeHandlesFile` / `readHandlesFile` /
    `rediscoverHandles`). See `internal/daemon/vm_handles.go`.
  - `d.vmAlive(vm)` replaces the 20+ inline
    `vm.State==Running && ProcessRunning(vm.Runtime.PID, apiSock)`
    spreads. Single source of truth for liveness.
  - Startup reconcile: per running VM, load the scratch file, pgrep
    the api sock, either keep (cache seeded from scratch) or demote
    to stopped (scratch handles passed to cleanupRuntime first so DM
    / loops / tap actually get torn down).

Verification:

  - `go test ./...` green.
  - Live: `banger vm run --name handles-test -- cat /etc/hostname`
    starts; `handles.json` appears in VMDir with the expected PID,
    tap, loops, DM.
  - `kill -9 $(pgrep bangerd)` while the VM is running, re-invoke the
    CLI, daemon auto-starts, reconcile recognises the VM as alive,
    `banger vm ssh` still connects, `banger vm delete` cleans up.

Tests added:

  - vm_handles_test.go: scratch-file roundtrip, missing/corrupt file
    behaviour, cache concurrency, rediscoverHandles prefers pgrep
    over scratch, returns scratch contents even when process is
    dead (so cleanup can tear down kernel state).
  - vm_test.go: reconcile test rewritten to exercise the new flow
    (write scratch → reconcile reads it → verifies process is gone →
    issues dmsetup/losetup teardown).

ARCHITECTURE.md updated; `handles` added to Daemon field docs.
2026-04-19 14:18:13 -03:00

199 lines
7.4 KiB
Go

package daemon
import (
"bytes"
"context"
"errors"
"fmt"
"net"
"strings"
"time"
"banger/internal/api"
sess "banger/internal/daemon/session"
ws "banger/internal/daemon/workspace"
"banger/internal/model"
)
// Test seams. Tests swap these to observe or stall the guest-I/O
// phase without needing a real git repo or SSH server. Production
// callers see the real implementations from the workspace package.
var (
workspaceInspectRepoFunc = ws.InspectRepo
workspaceImportFunc = ws.ImportRepoToGuest
)
func (d *Daemon) ExportVMWorkspace(ctx context.Context, params api.WorkspaceExportParams) (api.WorkspaceExportResult, error) {
guestPath := strings.TrimSpace(params.GuestPath)
if guestPath == "" {
guestPath = "/root/repo"
}
vm, err := d.FindVM(ctx, params.IDOrName)
if err != nil {
return api.WorkspaceExportResult{}, err
}
if !d.vmAlive(vm) {
return api.WorkspaceExportResult{}, fmt.Errorf("vm %q is not running", vm.Name)
}
// Serialise with any in-flight workspace.prepare on the same VM so
// we never snapshot a half-streamed tar. Does not block vm stop /
// delete / restart — those only take the VM mutex.
unlock := d.workspaceLocks.lock(vm.ID)
defer unlock()
client, err := d.dialGuest(ctx, net.JoinHostPort(vm.Runtime.GuestIP, "22"))
if err != nil {
return api.WorkspaceExportResult{}, fmt.Errorf("dial guest: %w", err)
}
defer client.Close()
// diffRef is the git ref everything is diffed against.
// When the caller supplies BaseCommit (the HEAD at workspace.prepare time),
// we diff against that fixed point so committed guest changes are included.
// Without it we fall back to HEAD, which silently drops them.
diffRef := strings.TrimSpace(params.BaseCommit)
if diffRef == "" {
diffRef = "HEAD"
}
// Both scripts run `git add -A` to capture the working tree
// (committed deltas + uncommitted modifications + untracked files
// minus .gitignore), but they route it through a throwaway index
// file instead of .git/index. Export is an observation step; the
// user's real staging area must stay exactly as they left it.
patchScript := exportScript(guestPath, diffRef, "--binary")
patch, err := client.RunScriptOutput(ctx, patchScript)
if err != nil {
return api.WorkspaceExportResult{}, fmt.Errorf("export workspace diff: %w", err)
}
namesScript := exportScript(guestPath, diffRef, "--name-only")
namesOut, _ := client.RunScriptOutput(ctx, namesScript)
var changed []string
for _, line := range strings.Split(strings.TrimSpace(string(namesOut)), "\n") {
if line = strings.TrimSpace(line); line != "" {
changed = append(changed, line)
}
}
return api.WorkspaceExportResult{
GuestPath: guestPath,
BaseCommit: diffRef,
Patch: patch,
ChangedFiles: changed,
HasChanges: len(patch) > 0,
}, nil
}
// exportScript emits a shell snippet that diffs the working tree at
// guestPath against diffRef (HEAD or a commit SHA) WITHOUT touching
// the repo's real index. diffFlag selects the git-diff output mode
// ("--binary" for the patch body, "--name-only" for the file list).
//
// Mechanics: seed a temp index from diffRef's tree via git read-tree,
// restage the working tree into that temp index with GIT_INDEX_FILE,
// then emit the diff. The temp index is rm'd on exit via trap.
func exportScript(guestPath, diffRef, diffFlag string) string {
return fmt.Sprintf(
"set -euo pipefail\n"+
"cd %s\n"+
"tmp_idx=\"$(mktemp \"${TMPDIR:-/tmp}/banger-export.XXXXXX\")\"\n"+
"trap 'rm -f \"$tmp_idx\"' EXIT\n"+
"git read-tree %s --index-output=\"$tmp_idx\"\n"+
"GIT_INDEX_FILE=\"$tmp_idx\" git add -A\n"+
"GIT_INDEX_FILE=\"$tmp_idx\" git diff --cached %s %s\n",
sess.ShellQuote(guestPath),
sess.ShellQuote(diffRef),
sess.ShellQuote(diffRef),
diffFlag,
)
}
func (d *Daemon) PrepareVMWorkspace(ctx context.Context, params api.VMWorkspacePrepareParams) (model.WorkspacePrepareResult, error) {
mode, err := ws.ParsePrepareMode(params.Mode)
if err != nil {
return model.WorkspacePrepareResult{}, err
}
guestPath := strings.TrimSpace(params.GuestPath)
if guestPath == "" {
guestPath = "/root/repo"
}
branchName := strings.TrimSpace(params.Branch)
fromRef := strings.TrimSpace(params.From)
if branchName != "" && fromRef == "" {
fromRef = "HEAD"
}
if branchName == "" && strings.TrimSpace(params.From) != "" {
return model.WorkspacePrepareResult{}, errors.New("workspace from requires branch")
}
// Phase 1: acquire the VM mutex ONLY long enough to verify state
// and snapshot the fields we need (IP, PID, api sock). Release it
// before any SSH or tar I/O so this slow operation cannot block
// vm stop / vm delete / vm restart on the same VM.
vm, err := d.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
if !d.vmAlive(vm) {
return model.VMRecord{}, fmt.Errorf("vm %q is not running", vm.Name)
}
return vm, nil
})
if err != nil {
return model.WorkspacePrepareResult{}, err
}
// Phase 2: serialise concurrent workspace operations on THIS vm
// (so two prepares don't interleave tar streams), but do not
// block lifecycle ops. If the VM gets stopped or deleted mid-
// flight, the SSH dial or stream will fail naturally; ctx
// cancellation propagates through.
unlock := d.workspaceLocks.lock(vm.ID)
defer unlock()
return d.prepareVMWorkspaceGuestIO(ctx, vm, strings.TrimSpace(params.SourcePath), guestPath, branchName, fromRef, mode, params.ReadOnly)
}
// prepareVMWorkspaceGuestIO performs the actual guest-side work:
// inspect the local repo, dial SSH, stream the tar, optionally chmod
// readonly. It is called without holding the VM mutex.
func (d *Daemon) prepareVMWorkspaceGuestIO(ctx context.Context, vm model.VMRecord, sourcePath, guestPath, branchName, fromRef string, mode model.WorkspacePrepareMode, readOnly bool) (model.WorkspacePrepareResult, error) {
spec, err := workspaceInspectRepoFunc(ctx, sourcePath, branchName, fromRef)
if err != nil {
return model.WorkspacePrepareResult{}, err
}
if len(spec.Submodules) > 0 && mode != model.WorkspacePrepareModeFullCopy {
return model.WorkspacePrepareResult{}, fmt.Errorf("workspace mode %q does not support git submodules in %s (%s); use --mode full_copy", mode, spec.RepoRoot, strings.Join(spec.Submodules, ", "))
}
address := net.JoinHostPort(vm.Runtime.GuestIP, "22")
if err := d.waitForGuestSSH(ctx, address, 250*time.Millisecond); err != nil {
return model.WorkspacePrepareResult{}, fmt.Errorf("guest ssh unavailable: %w", err)
}
client, err := d.dialGuest(ctx, address)
if err != nil {
return model.WorkspacePrepareResult{}, fmt.Errorf("dial guest ssh: %w", err)
}
defer client.Close()
if err := workspaceImportFunc(ctx, client, spec, guestPath, mode); err != nil {
return model.WorkspacePrepareResult{}, err
}
if readOnly {
var chmodLog bytes.Buffer
chmodScript := fmt.Sprintf("set -euo pipefail\nchmod -R a-w %s\n", sess.ShellQuote(guestPath))
if err := client.RunScript(ctx, chmodScript, &chmodLog); err != nil {
return model.WorkspacePrepareResult{}, sess.FormatStepError("set workspace readonly", err, chmodLog.String())
}
}
return model.WorkspacePrepareResult{
VMID: vm.ID,
SourcePath: spec.SourcePath,
RepoRoot: spec.RepoRoot,
RepoName: spec.RepoName,
GuestPath: guestPath,
Mode: mode,
ReadOnly: readOnly,
HeadCommit: spec.HeadCommit,
CurrentBranch: spec.CurrentBranch,
BranchName: spec.BranchName,
BaseCommit: spec.BaseCommit,
PreparedAt: model.Now(),
}, nil
}