Add workspace export and baseline diff

Complete the 2.6.0 workspace milestone by adding explicit host-out export and immutable-baseline diff across the CLI, Python SDK, and MCP server.

Capture a baseline archive at workspace creation, export live /workspace paths through the guest agent, and compute structured whole-workspace diffs on the host without affecting command logs or shell state. The docs, roadmap, bundled guest agent, and workspace example now reflect the new create -> sync -> diff -> export workflow.

Validation: uv lock, UV_CACHE_DIR=.uv-cache make check, UV_CACHE_DIR=.uv-cache make dist-check, and a real guest-backed Firecracker smoke test covering workspace create, sync push, diff, export, and delete.
This commit is contained in:
Thales Maciel 2026-03-12 03:15:45 -03:00
parent 3f8293ad24
commit 84a7e18d4d
26 changed files with 1492 additions and 43 deletions

View file

@ -2,6 +2,7 @@
from __future__ import annotations
import difflib
import json
import os
import shlex
@ -43,7 +44,9 @@ DEFAULT_TIMEOUT_SECONDS = 30
DEFAULT_TTL_SECONDS = 600
DEFAULT_ALLOW_HOST_COMPAT = False
# Layout version 4 adds the baseline snapshot directory; the stale
# duplicate assignment to 3 (leftover from the previous layout) is removed.
WORKSPACE_LAYOUT_VERSION = 4
WORKSPACE_BASELINE_DIRNAME = "baseline"
WORKSPACE_BASELINE_ARCHIVE_NAME = "workspace.tar"
WORKSPACE_DIRNAME = "workspace"
WORKSPACE_COMMANDS_DIRNAME = "commands"
WORKSPACE_SHELLS_DIRNAME = "shells"
@ -57,6 +60,7 @@ DEFAULT_SHELL_MAX_CHARS = 65536
# Signal names accepted by workspace shells, computed once at import time.
WORKSPACE_SHELL_SIGNAL_NAMES = shell_signal_names()
# Closed sets: how a workspace may be seeded, and what kind of filesystem
# artifact an exported workspace path can be.
WorkspaceSeedMode = Literal["empty", "directory", "tar_archive"]
WorkspaceArtifactType = Literal["file", "directory", "symlink"]
@dataclass
@ -287,6 +291,24 @@ class VmExecResult:
duration_ms: int
@dataclass(frozen=True)
class ExportedWorkspaceArchive:
    """Summary of one workspace path packed into a host-side tar archive."""

    workspace_path: str  # normalized workspace path that was exported
    artifact_type: WorkspaceArtifactType  # "file", "directory", or "symlink"
    archive_path: Path  # host path of the tar archive that was written
    entry_count: int  # number of archive members, per _inspect_seed_archive
    bytes_written: int  # archive payload size, per _inspect_seed_archive
@dataclass(frozen=True)
class WorkspaceTreeEntry:
    """One artifact found while walking an extracted workspace tree."""

    path: str  # slash-joined path relative to the tree root
    artifact_type: WorkspaceArtifactType  # "file", "directory", or "symlink"
    disk_path: Path  # absolute location of the artifact on disk
    size_bytes: int = 0  # file size; stays 0 for directories and symlinks
    link_target: str | None = None  # raw symlink target, None for non-links
def _optional_int(value: object) -> int | None:
if value is None:
return None
@ -522,6 +544,66 @@ def _write_directory_seed_archive(source_dir: Path, archive_path: Path) -> None:
archive.add(child, arcname=child.name, recursive=True)
def _write_empty_seed_archive(archive_path: Path) -> None:
archive_path.parent.mkdir(parents=True, exist_ok=True)
with tarfile.open(archive_path, "w"):
pass
def _persist_workspace_baseline(
prepared_seed: PreparedWorkspaceSeed,
*,
baseline_archive_path: Path,
) -> None:
baseline_archive_path.parent.mkdir(parents=True, exist_ok=True)
if prepared_seed.archive_path is None:
_write_empty_seed_archive(baseline_archive_path)
return
shutil.copy2(prepared_seed.archive_path, baseline_archive_path)
def _write_workspace_export_archive(
source_path: Path,
*,
archive_path: Path,
) -> WorkspaceArtifactType:
archive_path.parent.mkdir(parents=True, exist_ok=True)
if source_path.is_symlink():
artifact_type: WorkspaceArtifactType = "symlink"
elif source_path.is_file():
artifact_type = "file"
elif source_path.is_dir():
artifact_type = "directory"
else:
raise RuntimeError(f"unsupported workspace path type: {source_path}")
def validate_source(current_path: Path, relative_path: PurePosixPath) -> None:
if current_path.is_symlink():
_validate_archive_symlink_target(relative_path, os.readlink(current_path))
return
if current_path.is_file():
return
if current_path.is_dir():
for child in sorted(current_path.iterdir(), key=lambda item: item.name):
validate_source(child, relative_path / child.name)
return
raise RuntimeError(f"unsupported workspace path type: {current_path}")
if artifact_type == "directory":
for child in sorted(source_path.iterdir(), key=lambda item: item.name):
validate_source(child, PurePosixPath(child.name))
else:
validate_source(source_path, PurePosixPath(source_path.name))
with tarfile.open(archive_path, "w") as archive:
archive.dereference = False
if artifact_type == "directory":
for child in sorted(source_path.iterdir(), key=lambda item: item.name):
archive.add(child, arcname=child.name, recursive=True)
else:
archive.add(source_path, arcname=source_path.name, recursive=False)
return artifact_type
def _extract_seed_archive_to_host_workspace(
archive_path: Path,
*,
@ -576,6 +658,120 @@ def _extract_seed_archive_to_host_workspace(
}
def _prepare_workspace_export_archive(
    *,
    workspace_dir: Path,
    workspace_path: str,
    archive_path: Path,
) -> ExportedWorkspaceArchive:
    """Archive one workspace path from the host mirror and describe the result.

    Raises RuntimeError when the normalized path names nothing on disk
    (a dangling symlink still counts as present).
    """
    normalized, _ = _normalize_workspace_destination(workspace_path)
    source = _workspace_host_destination(workspace_dir, normalized)
    # exists() follows symlinks, so a broken link needs the explicit check.
    if not (source.exists() or source.is_symlink()):
        raise RuntimeError(f"workspace path does not exist: {normalized}")
    kind = _write_workspace_export_archive(source, archive_path=archive_path)
    entries, size = _inspect_seed_archive(archive_path)
    return ExportedWorkspaceArchive(
        workspace_path=normalized,
        artifact_type=kind,
        archive_path=archive_path,
        entry_count=entries,
        bytes_written=size,
    )
def _extract_workspace_export_archive(
    archive_path: Path,
    *,
    output_path: Path,
    artifact_type: WorkspaceArtifactType,
) -> dict[str, Any]:
    """Unpack an exported workspace archive to *output_path* and summarize it.

    A "directory" export is extracted member-by-member under a freshly created
    directory; "file" and "symlink" exports must contain exactly one member.
    Returns a dict with output_path, artifact_type, entry_count, bytes_written.
    Raises RuntimeError on a pre-existing output path, unsafe members
    (hard links, symlink parents, path conflicts), or member/type mismatch.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Refuse to clobber anything, including a dangling symlink (for which
    # exists() reports False, hence the extra is_symlink() check).
    if output_path.exists() or output_path.is_symlink():
        raise RuntimeError(f"output_path already exists: {output_path}")
    entry_count = 0
    bytes_written = 0
    if artifact_type == "directory":
        output_path.mkdir(parents=True, exist_ok=False)
        with tarfile.open(archive_path, "r:*") as archive:
            for member in archive.getmembers():
                # Normalization rejects absolute/escaping names before any I/O.
                member_name = _normalize_archive_member_name(member.name)
                target_path = output_path.joinpath(*member_name.parts)
                entry_count += 1
                # No ancestor of the target may be a symlink (escape guard).
                _ensure_no_symlink_parents(output_path, target_path, member.name)
                if member.isdir():
                    if target_path.is_symlink() or (
                        target_path.exists() and not target_path.is_dir()
                    ):
                        raise RuntimeError(f"directory conflicts with existing path: {member.name}")
                    target_path.mkdir(parents=True, exist_ok=True)
                    continue
                if member.isfile():
                    target_path.parent.mkdir(parents=True, exist_ok=True)
                    if target_path.is_symlink() or target_path.is_dir():
                        raise RuntimeError(f"file conflicts with existing path: {member.name}")
                    source = archive.extractfile(member)
                    if source is None:
                        raise RuntimeError(f"failed to read archive member: {member.name}")
                    # Stream the payload rather than loading it into memory.
                    with target_path.open("wb") as handle:
                        shutil.copyfileobj(source, handle)
                    bytes_written += member.size
                    continue
                if member.issym():
                    _validate_archive_symlink_target(member_name, member.linkname)
                    target_path.parent.mkdir(parents=True, exist_ok=True)
                    if target_path.exists() and not target_path.is_symlink():
                        raise RuntimeError(f"symlink conflicts with existing path: {member.name}")
                    # Replace an existing symlink so os.symlink cannot fail.
                    if target_path.is_symlink():
                        target_path.unlink()
                    os.symlink(member.linkname, target_path)
                    continue
                if member.islnk():
                    raise RuntimeError(
                        f"hard links are not allowed in workspace archives: {member.name}"
                    )
                raise RuntimeError(f"unsupported archive member type: {member.name}")
        return {
            "output_path": str(output_path),
            "artifact_type": artifact_type,
            "entry_count": entry_count,
            "bytes_written": bytes_written,
        }
    # Single-artifact exports: the archive must hold exactly one member whose
    # kind matches the artifact_type the backend reported.
    with tarfile.open(archive_path, "r:*") as archive:
        members = archive.getmembers()
        if len(members) != 1:
            raise RuntimeError(
                "expected exactly one archive member for "
                f"{artifact_type} export, got {len(members)}"
            )
        member = members[0]
        # Validation only; the single member is written directly to output_path.
        _normalize_archive_member_name(member.name)
        entry_count = 1
        if artifact_type == "file":
            if not member.isfile():
                raise RuntimeError("exported archive did not contain a regular file")
            source = archive.extractfile(member)
            if source is None:
                raise RuntimeError(f"failed to read archive member: {member.name}")
            with output_path.open("wb") as handle:
                shutil.copyfileobj(source, handle)
            bytes_written = member.size
        elif artifact_type == "symlink":
            if not member.issym():
                raise RuntimeError("exported archive did not contain a symlink")
            _validate_archive_symlink_target(PurePosixPath(member.name), member.linkname)
            os.symlink(member.linkname, output_path)
        else:
            raise RuntimeError(f"unsupported artifact type: {artifact_type}")
    return {
        "output_path": str(output_path),
        "artifact_type": artifact_type,
        "entry_count": entry_count,
        "bytes_written": bytes_written,
    }
def _instance_workspace_host_dir(instance: VmInstance) -> Path:
raw_value = instance.metadata.get("workspace_host_dir")
if raw_value is None or raw_value == "":
@ -640,6 +836,205 @@ def _pid_is_running(pid: int | None) -> bool:
return True
def _collect_workspace_tree(root: Path) -> dict[str, WorkspaceTreeEntry]:
    """Index every artifact under *root* by its slash-joined relative path.

    Files and symlinks are always recorded; a directory is recorded only when
    empty, since non-empty directories are implied by their contents.
    """
    entries: dict[str, WorkspaceTreeEntry] = {}

    def visit(directory: Path, prefix: tuple[str, ...] = ()) -> bool:
        # Returns True when *directory* contributed at least one entry, so the
        # caller can decide whether to record it as an empty directory.
        found = False
        for node in sorted(directory.iterdir(), key=lambda item: item.name):
            parts = prefix + (node.name,)
            rel = "/".join(parts)
            # Symlink test must come first: is_file()/is_dir() follow links.
            if node.is_symlink():
                entries[rel] = WorkspaceTreeEntry(
                    path=rel,
                    artifact_type="symlink",
                    disk_path=node,
                    link_target=os.readlink(node),
                )
                found = True
            elif node.is_file():
                entries[rel] = WorkspaceTreeEntry(
                    path=rel,
                    artifact_type="file",
                    disk_path=node,
                    size_bytes=node.stat().st_size,
                )
                found = True
            elif node.is_dir():
                if visit(node, parts):
                    found = True
                else:
                    entries[rel] = WorkspaceTreeEntry(
                        path=rel,
                        artifact_type="directory",
                        disk_path=node,
                    )
                    found = True
            else:
                raise RuntimeError(f"unsupported workspace artifact type: {node}")
        return found

    visit(root)
    return entries
def _is_probably_text(data: bytes) -> bool:
if b"\x00" in data:
return False
try:
data.decode("utf-8")
except UnicodeDecodeError:
return False
return True
def _build_text_patch(
*,
path: str,
before_text: str,
after_text: str,
status: str,
) -> str:
if status == "added":
fromfile = "/dev/null"
tofile = f"b/{path}"
elif status == "deleted":
fromfile = f"a/{path}"
tofile = "/dev/null"
else:
fromfile = f"a/{path}"
tofile = f"b/{path}"
lines = list(
difflib.unified_diff(
before_text.splitlines(keepends=True),
after_text.splitlines(keepends=True),
fromfile=fromfile,
tofile=tofile,
n=3,
)
)
if not lines:
return ""
return "".join(lines)
def _diff_workspace_trees(
    baseline_root: Path,
    current_root: Path,
) -> dict[str, Any]:
    """Compare two extracted workspace trees and build a structured diff payload.

    Returns a dict with: changed (bool), summary (per-status counters),
    entries (one payload per added/deleted/modified/type_changed path), and
    patch (concatenated unified diffs for all text-file changes).
    """
    baseline_entries = _collect_workspace_tree(baseline_root)
    current_entries = _collect_workspace_tree(current_root)
    changed_entries: list[dict[str, Any]] = []
    patch_parts: list[str] = []
    summary = {
        "total": 0,
        "added": 0,
        "modified": 0,
        "deleted": 0,
        "type_changed": 0,
        "text_patched": 0,
        "non_text": 0,
    }
    # Walk the union of both trees in sorted order so output is deterministic.
    for path in sorted(set(baseline_entries) | set(current_entries)):
        baseline_entry = baseline_entries.get(path)
        current_entry = current_entries.get(path)
        entry_payload: dict[str, Any] | None = None
        text_patch = ""
        if baseline_entry is None and current_entry is not None:
            # Present only in the current tree: added.
            entry_payload = {
                "path": path,
                "status": "added",
                "artifact_type": current_entry.artifact_type,
                "text_patch": None,
            }
            if current_entry.artifact_type == "file":
                current_bytes = current_entry.disk_path.read_bytes()
                if _is_probably_text(current_bytes):
                    text_patch = _build_text_patch(
                        path=path,
                        before_text="",
                        after_text=current_bytes.decode("utf-8"),
                        status="added",
                    )
        elif current_entry is None and baseline_entry is not None:
            # Present only in the baseline tree: deleted.
            entry_payload = {
                "path": path,
                "status": "deleted",
                "artifact_type": baseline_entry.artifact_type,
                "text_patch": None,
            }
            if baseline_entry.artifact_type == "file":
                baseline_bytes = baseline_entry.disk_path.read_bytes()
                if _is_probably_text(baseline_bytes):
                    text_patch = _build_text_patch(
                        path=path,
                        before_text=baseline_bytes.decode("utf-8"),
                        after_text="",
                        status="deleted",
                    )
        elif baseline_entry is not None and current_entry is not None:
            if baseline_entry.artifact_type != current_entry.artifact_type:
                # Same path, different artifact kind (e.g. file -> symlink).
                entry_payload = {
                    "path": path,
                    "status": "type_changed",
                    "artifact_type": current_entry.artifact_type,
                    "text_patch": None,
                }
            elif current_entry.artifact_type == "directory":
                # Directories are only recorded when empty; unchanged either way.
                continue
            elif current_entry.artifact_type == "symlink":
                if baseline_entry.link_target != current_entry.link_target:
                    entry_payload = {
                        "path": path,
                        "status": "modified",
                        "artifact_type": current_entry.artifact_type,
                        "text_patch": None,
                    }
            else:
                # Regular file on both sides: compare content bytes.
                baseline_bytes = baseline_entry.disk_path.read_bytes()
                current_bytes = current_entry.disk_path.read_bytes()
                if baseline_bytes == current_bytes:
                    continue
                entry_payload = {
                    "path": path,
                    "status": "modified",
                    "artifact_type": current_entry.artifact_type,
                    "text_patch": None,
                }
                if _is_probably_text(baseline_bytes) and _is_probably_text(current_bytes):
                    text_patch = _build_text_patch(
                        path=path,
                        before_text=baseline_bytes.decode("utf-8"),
                        after_text=current_bytes.decode("utf-8"),
                        status="modified",
                    )
        if entry_payload is None:
            continue
        summary["total"] += 1
        summary[str(entry_payload["status"])] += 1
        if text_patch != "":
            entry_payload["text_patch"] = text_patch
            patch_parts.append(text_patch)
            summary["text_patched"] += 1
        else:
            # Binary content, symlinks, and type changes carry no text patch.
            summary["non_text"] += 1
        changed_entries.append(entry_payload)
    return {
        "changed": bool(changed_entries),
        "summary": summary,
        "entries": changed_entries,
        "patch": "".join(patch_parts),
    }
class VmBackend:
"""Backend interface for lifecycle operations."""
@ -674,6 +1069,15 @@ class VmBackend:
) -> dict[str, Any]:
raise NotImplementedError
def export_archive(  # pragma: no cover
    self,
    instance: VmInstance,
    *,
    workspace_path: str,
    archive_path: Path,
) -> dict[str, Any]:
    """Write a tar archive of *workspace_path* from *instance* to *archive_path*.

    Abstract: concrete backends return a summary dict (workspace_path,
    artifact_type, entry_count, bytes_written, execution_mode).
    """
    raise NotImplementedError
def open_shell( # pragma: no cover
self,
instance: VmInstance,
@ -768,6 +1172,26 @@ class MockBackend(VmBackend):
destination=destination,
)
def export_archive(
    self,
    instance: VmInstance,
    *,
    workspace_path: str,
    archive_path: Path,
) -> dict[str, Any]:
    """Export from the host-side workspace mirror (mock is always host_compat)."""
    workspace_dir = _instance_workspace_host_dir(instance)
    result = _prepare_workspace_export_archive(
        workspace_dir=workspace_dir,
        workspace_path=workspace_path,
        archive_path=archive_path,
    )
    summary: dict[str, Any] = {
        "workspace_path": result.workspace_path,
        "artifact_type": result.artifact_type,
        "entry_count": result.entry_count,
        "bytes_written": result.bytes_written,
    }
    summary["execution_mode"] = "host_compat"
    return summary
def open_shell(
self,
instance: VmInstance,
@ -1086,6 +1510,55 @@ class FirecrackerBackend(VmBackend): # pragma: no cover
destination=destination,
)
def export_archive(
    self,
    instance: VmInstance,
    *,
    workspace_path: str,
    archive_path: Path,
) -> dict[str, Any]:
    """Export *workspace_path* from the guest, falling back to the host mirror.

    When guest exec is supported, the guest-agent transport is retried for up
    to 10 seconds before failing; otherwise the instance is marked host_compat
    and the host-side workspace directory is archived instead.
    """
    if self._runtime_capabilities.supports_guest_exec:
        guest_cid = int(instance.metadata["guest_cid"])
        port = int(instance.metadata["guest_exec_port"])
        uds_path = instance.metadata.get("guest_exec_uds_path")
        # Give the vsock/UDS transport a short window to become ready.
        deadline = time.monotonic() + 10
        while True:
            try:
                response = self._guest_exec_client.export_archive(
                    guest_cid,
                    port,
                    workspace_path=workspace_path,
                    archive_path=archive_path,
                    timeout_seconds=WORKSPACE_ARCHIVE_UPLOAD_TIMEOUT_SECONDS,
                    uds_path=uds_path,
                )
                return {
                    "workspace_path": response.workspace_path,
                    "artifact_type": response.artifact_type,
                    "entry_count": response.entry_count,
                    "bytes_written": response.bytes_written,
                    "execution_mode": instance.metadata.get("execution_mode", "pending"),
                }
            except (OSError, RuntimeError) as exc:
                if time.monotonic() >= deadline:
                    raise RuntimeError(
                        f"guest export transport did not become ready: {exc}"
                    ) from exc
                # Transient transport error: back off briefly and retry.
                time.sleep(0.2)
    # No guest exec available: archive the host-side mirror instead.
    instance.metadata["execution_mode"] = "host_compat"
    exported = _prepare_workspace_export_archive(
        workspace_dir=_instance_workspace_host_dir(instance),
        workspace_path=workspace_path,
        archive_path=archive_path,
    )
    return {
        "workspace_path": exported.workspace_path,
        "artifact_type": exported.artifact_type,
        "entry_count": exported.entry_count,
        "bytes_written": exported.bytes_written,
        "execution_mode": "host_compat",
    }
def open_shell(
self,
instance: VmInstance,
@ -1488,10 +1961,15 @@ class VmManager:
host_workspace_dir = self._workspace_host_dir(workspace_id)
commands_dir = self._workspace_commands_dir(workspace_id)
shells_dir = self._workspace_shells_dir(workspace_id)
baseline_archive_path = self._workspace_baseline_archive_path(workspace_id)
workspace_dir.mkdir(parents=True, exist_ok=False)
host_workspace_dir.mkdir(parents=True, exist_ok=True)
commands_dir.mkdir(parents=True, exist_ok=True)
shells_dir.mkdir(parents=True, exist_ok=True)
_persist_workspace_baseline(
prepared_seed,
baseline_archive_path=baseline_archive_path,
)
instance = VmInstance(
vm_id=workspace_id,
environment=environment,
@ -1523,23 +2001,14 @@ class VmManager:
self._start_instance_locked(instance)
self._require_guest_exec_or_opt_in(instance)
workspace_seed = prepared_seed.to_payload()
if prepared_seed.archive_path is not None:
import_summary = self._backend.import_archive(
instance,
archive_path=prepared_seed.archive_path,
destination=WORKSPACE_GUEST_PATH,
)
workspace_seed["entry_count"] = int(import_summary["entry_count"])
workspace_seed["bytes_written"] = int(import_summary["bytes_written"])
workspace_seed["destination"] = str(import_summary["destination"])
elif self._runtime_capabilities.supports_guest_exec:
self._backend.exec(
instance,
f"mkdir -p {shlex.quote(WORKSPACE_GUEST_PATH)}",
10,
)
else:
instance.metadata["execution_mode"] = "host_compat"
import_summary = self._backend.import_archive(
instance,
archive_path=baseline_archive_path,
destination=WORKSPACE_GUEST_PATH,
)
workspace_seed["entry_count"] = int(import_summary["entry_count"])
workspace_seed["bytes_written"] = int(import_summary["bytes_written"])
workspace_seed["destination"] = str(import_summary["destination"])
workspace = WorkspaceRecord.from_instance(instance, workspace_seed=workspace_seed)
self._save_workspace_locked(workspace)
return self._serialize_workspace(workspace)
@ -1612,6 +2081,98 @@ class VmManager:
"workspace_sync": workspace_sync,
}
def export_workspace(
    self,
    workspace_id: str,
    *,
    path: str,
    output_path: str | Path,
) -> dict[str, Any]:
    """Export one workspace path to a fresh location on the host.

    Validates inputs, exports/extracts via the backend, persists any instance
    state observed during the export, and returns a summary dict. Raises
    ValueError on an empty output_path.
    """
    normalized_path, _ = _normalize_workspace_destination(path)
    raw_output_path = str(output_path).strip()
    if raw_output_path == "":
        raise ValueError("output_path must not be empty")
    resolved_output_path = Path(output_path).expanduser().resolve()
    with self._lock:
        workspace = self._load_workspace_locked(workspace_id)
        instance = self._workspace_instance_for_live_operation_locked(
            workspace,
            operation_name="workspace_export",
        )
    # NOTE(review): indentation was lost in the source view; this block is
    # assumed to run outside self._lock so guest I/O does not hold it — confirm.
    with tempfile.TemporaryDirectory(prefix="pyro-workspace-export-") as temp_dir:
        archive_path = Path(temp_dir) / "workspace-export.tar"
        exported = self._backend.export_archive(
            instance,
            workspace_path=normalized_path,
            archive_path=archive_path,
        )
        extracted = _extract_workspace_export_archive(
            archive_path,
            output_path=resolved_output_path,
            artifact_type=cast(WorkspaceArtifactType, str(exported["artifact_type"])),
        )
    with self._lock:
        # Persist state/metadata the backend may have updated on the instance.
        workspace = self._load_workspace_locked(workspace_id)
        workspace.state = instance.state
        workspace.firecracker_pid = instance.firecracker_pid
        workspace.last_error = instance.last_error
        workspace.metadata = dict(instance.metadata)
        self._save_workspace_locked(workspace)
    return {
        "workspace_id": workspace_id,
        "workspace_path": normalized_path,
        "output_path": str(Path(str(extracted["output_path"]))),
        "artifact_type": extracted["artifact_type"],
        "entry_count": int(extracted["entry_count"]),
        "bytes_written": int(extracted["bytes_written"]),
        "execution_mode": str(
            exported.get("execution_mode", instance.metadata.get("execution_mode", "pending"))
        ),
    }
def diff_workspace(self, workspace_id: str) -> dict[str, Any]:
    """Diff the live workspace contents against its creation-time baseline.

    Exports the current /workspace tree, extracts it and the baseline archive
    into a temp directory, and returns the structured payload from
    _diff_workspace_trees plus workspace_id. Raises RuntimeError when the
    workspace predates baseline capture.
    """
    with self._lock:
        workspace = self._load_workspace_locked(workspace_id)
        instance = self._workspace_instance_for_live_operation_locked(
            workspace,
            operation_name="workspace_diff",
        )
        baseline_archive_path = self._workspace_baseline_archive_path(workspace_id)
        if not baseline_archive_path.exists():
            raise RuntimeError(
                "workspace diff requires a baseline snapshot. Recreate the workspace to use diff."
            )
    # NOTE(review): indentation was lost in the source view; this block is
    # assumed to run outside self._lock so guest I/O does not hold it — confirm.
    with tempfile.TemporaryDirectory(prefix="pyro-workspace-diff-") as temp_dir:
        temp_root = Path(temp_dir)
        current_archive_path = temp_root / "current.tar"
        baseline_root = temp_root / "baseline"
        current_root = temp_root / "current"
        self._backend.export_archive(
            instance,
            workspace_path=WORKSPACE_GUEST_PATH,
            archive_path=current_archive_path,
        )
        _extract_seed_archive_to_host_workspace(
            baseline_archive_path,
            workspace_dir=baseline_root,
            destination=WORKSPACE_GUEST_PATH,
        )
        _extract_seed_archive_to_host_workspace(
            current_archive_path,
            workspace_dir=current_root,
            destination=WORKSPACE_GUEST_PATH,
        )
        diff_payload = _diff_workspace_trees(baseline_root, current_root)
    with self._lock:
        # Persist state/metadata the backend may have updated on the instance.
        workspace = self._load_workspace_locked(workspace_id)
        workspace.state = instance.state
        workspace.firecracker_pid = instance.firecracker_pid
        workspace.last_error = instance.last_error
        workspace.metadata = dict(instance.metadata)
        self._save_workspace_locked(workspace)
    diff_payload["workspace_id"] = workspace_id
    return diff_payload
def exec_workspace(
self,
workspace_id: str,
@ -2136,6 +2697,12 @@ class VmManager:
def _workspace_host_dir(self, workspace_id: str) -> Path:
    """Per-workspace host directory holding the workspace contents."""
    return self._workspace_dir(workspace_id) / WORKSPACE_DIRNAME
def _workspace_baseline_dir(self, workspace_id: str) -> Path:
    """Directory holding the workspace's immutable baseline snapshot."""
    return self._workspace_dir(workspace_id) / WORKSPACE_BASELINE_DIRNAME
def _workspace_baseline_archive_path(self, workspace_id: str) -> Path:
    """Path of the baseline tar captured at workspace creation, used by diff."""
    return self._workspace_baseline_dir(workspace_id) / WORKSPACE_BASELINE_ARCHIVE_NAME
def _workspace_commands_dir(self, workspace_id: str) -> Path:
    """Directory holding per-workspace command records."""
    return self._workspace_dir(workspace_id) / WORKSPACE_COMMANDS_DIRNAME
@ -2278,17 +2845,29 @@ class VmManager:
return entries
def _workspace_instance_for_live_shell_locked(self, workspace: WorkspaceRecord) -> VmInstance:
    """Resolve a live instance for shell use, additionally requiring shell support."""
    instance = self._workspace_instance_for_live_operation_locked(
        workspace,
        operation_name="shell operations",
    )
    # Shell-specific capability check layered on top of the generic liveness check.
    self._require_workspace_shell_support(instance)
    return instance
def _workspace_instance_for_live_operation_locked(
    self,
    workspace: WorkspaceRecord,
    *,
    operation_name: str,
) -> VmInstance:
    """Validate that *workspace* is live and return its runnable instance.

    Raises RuntimeError (mentioning *operation_name*) unless the workspace is
    unexpired, liveness-refreshed, and in the 'started' state.
    """
    self._ensure_workspace_not_expired_locked(workspace, time.time())
    self._refresh_workspace_liveness_locked(workspace)
    if workspace.state != "started":
        # The source view carried both the old hard-coded "shell operations"
        # message and the parameterized one; only the parameterized message
        # belongs in this generic helper.
        raise RuntimeError(
            "workspace "
            f"{workspace.workspace_id} must be in 'started' state before {operation_name}"
        )
    # The shell-support check was moved to the shell-specific wrapper; this
    # generic helper must not require shell support for export/diff callers.
    return workspace.to_instance(
        workdir=self._workspace_runtime_dir(workspace.workspace_id)
    )
def _workspace_shell_record_from_payload(