Add workspace snapshots and full reset

Implement the 2.8.0 workspace milestone with named snapshots and full-sandbox reset across the CLI, Python SDK, and MCP server.

Persist the immutable baseline plus named snapshot archives under each workspace, add workspace reset metadata, and make reset recreate the sandbox while clearing command history, shells, and services without changing the workspace identity or diff baseline.

Refresh the 2.8.0 docs, roadmap, and Python example around reset-over-repair, then validate with uv lock, UV_CACHE_DIR=.uv-cache make check, UV_CACHE_DIR=.uv-cache make dist-check, and a real guest-backed create/snapshot/reset/diff smoke test outside the sandbox.
This commit is contained in:
Thales Maciel 2026-03-12 12:41:11 -03:00
parent f504f0a331
commit 18b8fd2a7d
20 changed files with 1429 additions and 29 deletions

View file

@ -49,9 +49,10 @@ DEFAULT_TIMEOUT_SECONDS = 30
DEFAULT_TTL_SECONDS = 600
DEFAULT_ALLOW_HOST_COMPAT = False
# NOTE(review): two consecutive assignments appear here; this looks like the old (4)
# and new (5) value of a diff rendered without +/- markers. If both lines were
# executed as shown, the later value (5) is the one that takes effect.
# The version is embedded as "layout_version" in persisted snapshot metadata.
WORKSPACE_LAYOUT_VERSION = 4
WORKSPACE_LAYOUT_VERSION = 5
WORKSPACE_BASELINE_DIRNAME = "baseline"
# File name of the baseline archive stored inside the workspace's baseline directory.
WORKSPACE_BASELINE_ARCHIVE_NAME = "workspace.tar"
# Per-workspace directory holding named snapshots as <name>.tar plus <name>.json.
WORKSPACE_SNAPSHOTS_DIRNAME = "snapshots"
WORKSPACE_DIRNAME = "workspace"
# Per-workspace directory name joined onto the workspace directory for commands.
WORKSPACE_COMMANDS_DIRNAME = "commands"
WORKSPACE_SHELLS_DIRNAME = "shells"
@ -68,10 +69,12 @@ DEFAULT_SERVICE_READY_INTERVAL_MS = 500
DEFAULT_SERVICE_LOG_TAIL_LINES = 200
WORKSPACE_SHELL_SIGNAL_NAMES = shell_signal_names()
# Service names: 1-64 characters, the first alphanumeric, the rest drawn from
# alphanumerics, ".", "_" and "-".
WORKSPACE_SERVICE_NAME_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]{0,63}$")
# Snapshot names follow exactly the same grammar as service names.
WORKSPACE_SNAPSHOT_NAME_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]{0,63}$")
WorkspaceSeedMode = Literal["empty", "directory", "tar_archive"]
WorkspaceArtifactType = Literal["file", "directory", "symlink"]
WorkspaceServiceReadinessType = Literal["file", "tcp", "http", "command"]
# "baseline" marks the record synthesized from the baseline archive;
# "named" marks snapshots created through create_snapshot.
WorkspaceSnapshotKind = Literal["baseline", "named"]
@dataclass
@ -116,6 +119,8 @@ class WorkspaceRecord:
command_count: int = 0
last_command: dict[str, Any] | None = None
workspace_seed: dict[str, Any] = field(default_factory=dict)
reset_count: int = 0
last_reset_at: float | None = None
@classmethod
def from_instance(
@ -144,6 +149,8 @@ class WorkspaceRecord:
command_count=command_count,
last_command=last_command,
workspace_seed=dict(workspace_seed or _empty_workspace_seed_payload()),
reset_count=0,
last_reset_at=None,
)
def to_instance(self, *, workdir: Path) -> VmInstance:
@ -185,6 +192,8 @@ class WorkspaceRecord:
"command_count": self.command_count,
"last_command": self.last_command,
"workspace_seed": self.workspace_seed,
"reset_count": self.reset_count,
"last_reset_at": self.last_reset_at,
}
@classmethod
@ -207,6 +216,46 @@ class WorkspaceRecord:
command_count=int(payload.get("command_count", 0)),
last_command=_optional_dict(payload.get("last_command")),
workspace_seed=_workspace_seed_dict(payload.get("workspace_seed")),
reset_count=int(payload.get("reset_count", 0)),
last_reset_at=(
None
if payload.get("last_reset_at") is None
else float(payload.get("last_reset_at", 0.0))
),
)
@dataclass
class WorkspaceSnapshotRecord:
    """Metadata for a single workspace snapshot, stored on disk per workspace.

    The record carries the owning workspace id, the snapshot name, whether it is
    the baseline or a named snapshot, its creation timestamp, and the entry/byte
    counts reported for its archive.
    """

    workspace_id: str
    snapshot_name: str
    kind: WorkspaceSnapshotKind
    created_at: float
    entry_count: int
    bytes_written: int

    def to_payload(self) -> dict[str, Any]:
        """Return the JSON-serializable form, tagged with the current layout version."""
        payload: dict[str, Any] = {"layout_version": WORKSPACE_LAYOUT_VERSION}
        payload["workspace_id"] = self.workspace_id
        payload["snapshot_name"] = self.snapshot_name
        payload["kind"] = self.kind
        payload["created_at"] = self.created_at
        payload["entry_count"] = self.entry_count
        payload["bytes_written"] = self.bytes_written
        return payload

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> WorkspaceSnapshotRecord:
        """Rebuild a record from a decoded payload.

        ``workspace_id``, ``snapshot_name`` and ``created_at`` are required keys
        (a missing one raises ``KeyError``); ``kind`` defaults to ``"named"`` and
        the two counters default to ``0``.
        """
        # Fields are read in declaration order so the first missing required key
        # is the one reported.
        owner_id = str(payload["workspace_id"])
        name = str(payload["snapshot_name"])
        snapshot_kind = cast(WorkspaceSnapshotKind, str(payload.get("kind", "named")))
        created = float(payload["created_at"])
        entries = int(payload.get("entry_count", 0))
        size = int(payload.get("bytes_written", 0))
        return cls(
            workspace_id=owner_id,
            snapshot_name=name,
            kind=snapshot_kind,
            created_at=created,
            entry_count=entries,
            bytes_written=size,
        )
@ -864,6 +913,24 @@ def _normalize_workspace_service_name(service_name: str) -> str:
return normalized
def _normalize_workspace_snapshot_name(
    snapshot_name: str,
    *,
    allow_baseline: bool = False,
) -> str:
    """Strip and validate a snapshot name, returning the cleaned value.

    Raises ``ValueError`` when the stripped name is empty, when it is the
    reserved name ``"baseline"`` and ``allow_baseline`` is false, or when it
    does not match ``WORKSPACE_SNAPSHOT_NAME_RE``.
    """
    candidate = snapshot_name.strip()
    if not candidate:
        raise ValueError("snapshot_name must not be empty")
    is_reserved = candidate == "baseline"
    if is_reserved and not allow_baseline:
        raise ValueError("snapshot_name 'baseline' is reserved")
    if WORKSPACE_SNAPSHOT_NAME_RE.fullmatch(candidate) is not None:
        return candidate
    raise ValueError(
        "snapshot_name must match "
        r"^[A-Za-z0-9][A-Za-z0-9._-]{0,63}$"
    )
def _normalize_workspace_service_readiness(
readiness: dict[str, Any] | None,
) -> dict[str, Any] | None:
@ -2646,12 +2713,14 @@ class VmManager:
commands_dir = self._workspace_commands_dir(workspace_id)
shells_dir = self._workspace_shells_dir(workspace_id)
services_dir = self._workspace_services_dir(workspace_id)
snapshots_dir = self._workspace_snapshots_dir(workspace_id)
baseline_archive_path = self._workspace_baseline_archive_path(workspace_id)
workspace_dir.mkdir(parents=True, exist_ok=False)
host_workspace_dir.mkdir(parents=True, exist_ok=True)
commands_dir.mkdir(parents=True, exist_ok=True)
shells_dir.mkdir(parents=True, exist_ok=True)
services_dir.mkdir(parents=True, exist_ok=True)
snapshots_dir.mkdir(parents=True, exist_ok=True)
_persist_workspace_baseline(
prepared_seed,
baseline_archive_path=baseline_archive_path,
@ -2859,6 +2928,192 @@ class VmManager:
diff_payload["workspace_id"] = workspace_id
return diff_payload
def create_snapshot(
self,
workspace_id: str,
snapshot_name: str,
) -> dict[str, Any]:
# Export the live workspace contents into a new named snapshot.
#
# Returns a dict with "workspace_id", the serialized "snapshot" record and the
# instance's "execution_mode" (falling back to "pending").
# Raises ValueError when the name is invalid, reserved, or already taken.
#
# The name is normalized with the default allow_baseline=False, so "baseline"
# is rejected before any workspace state is touched.
normalized_snapshot_name = _normalize_workspace_snapshot_name(snapshot_name)
# Phase 1 (under the lock): validate the workspace and reserve nothing yet.
with self._lock:
workspace = self._load_workspace_locked(workspace_id)
self._ensure_workspace_not_expired_locked(workspace, time.time())
# Called only for its side effect: it raises RuntimeError when the workspace
# has no baseline archive; the returned record is discarded.
self._workspace_baseline_snapshot_locked(workspace)
if (
self._load_workspace_snapshot_locked_optional(
workspace_id,
normalized_snapshot_name,
)
is not None
):
raise ValueError(
f"snapshot {normalized_snapshot_name!r} already exists in workspace "
f"{workspace_id!r}"
)
instance = self._workspace_instance_for_live_operation_locked(
workspace,
operation_name="workspace_snapshot_create",
)
# Phase 2: export the guest workspace into a temporary archive first, so the
# snapshots directory only ever receives a finished export.
with tempfile.TemporaryDirectory(prefix="pyro-workspace-snapshot-") as temp_dir:
temp_archive_path = Path(temp_dir) / f"{normalized_snapshot_name}.tar"
exported = self._backend.export_archive(
instance,
workspace_path=WORKSPACE_GUEST_PATH,
archive_path=temp_archive_path,
)
snapshot = WorkspaceSnapshotRecord(
workspace_id=workspace_id,
snapshot_name=normalized_snapshot_name,
kind="named",
created_at=time.time(),
entry_count=int(exported["entry_count"]),
bytes_written=int(exported["bytes_written"]),
)
# Phase 3 (under the lock again): reload the workspace and repeat the expiry and
# duplicate-name checks before publishing.
# NOTE(review): presumably the export above runs without the lock held, which is
# why the checks are repeated; indentation is lost in this view, so confirm.
with self._lock:
workspace = self._load_workspace_locked(workspace_id)
self._ensure_workspace_not_expired_locked(workspace, time.time())
if (
self._load_workspace_snapshot_locked_optional(
workspace_id,
normalized_snapshot_name,
)
is not None
):
raise ValueError(
f"snapshot {normalized_snapshot_name!r} already exists in workspace "
f"{workspace_id!r}"
)
archive_path = self._workspace_snapshot_archive_path(
workspace_id,
normalized_snapshot_name,
)
archive_path.parent.mkdir(parents=True, exist_ok=True)
# The temporary archive must still exist here, so this copy has to run while
# the TemporaryDirectory context above is still open.
shutil.copy2(temp_archive_path, archive_path)
# Mirror the instance's current runtime fields back onto the workspace record.
workspace.state = instance.state
workspace.firecracker_pid = instance.firecracker_pid
workspace.last_error = instance.last_error
workspace.metadata = dict(instance.metadata)
self._save_workspace_locked(workspace)
# The metadata file is written last; its presence is what the duplicate-name
# checks above look for.
self._save_workspace_snapshot_locked(snapshot)
return {
"workspace_id": workspace_id,
"snapshot": self._serialize_workspace_snapshot(snapshot),
"execution_mode": instance.metadata.get("execution_mode", "pending"),
}
def list_snapshots(self, workspace_id: str) -> dict[str, Any]:
    """Return every snapshot of a workspace in serialized form.

    The result holds the ``workspace_id``, the snapshot ``count`` and the
    ``snapshots`` list in the order produced by
    ``_list_workspace_snapshots_locked``.
    """
    with self._lock:
        record = self._load_workspace_locked(workspace_id)
        self._ensure_workspace_not_expired_locked(record, time.time())
        found = self._list_workspace_snapshots_locked(record)
    total = len(found)
    serialized = [self._serialize_workspace_snapshot(entry) for entry in found]
    return {
        "workspace_id": workspace_id,
        "count": total,
        "snapshots": serialized,
    }
def delete_snapshot(self, workspace_id: str, snapshot_name: str) -> dict[str, Any]:
    """Delete a named snapshot from a workspace.

    Raises ``ValueError`` for an invalid name, for the baseline snapshot, or
    (via ``_load_workspace_snapshot_locked``) when the snapshot does not exist.
    """
    # "baseline" is let through normalization so it gets the dedicated
    # "cannot delete" message rather than the generic "reserved" one.
    target = _normalize_workspace_snapshot_name(snapshot_name, allow_baseline=True)
    if target == "baseline":
        raise ValueError("cannot delete the baseline snapshot")

    with self._lock:
        record = self._load_workspace_locked(workspace_id)
        self._ensure_workspace_not_expired_locked(record, time.time())
        # Both calls are made for their checks only: the first raises when the
        # baseline archive is missing, the second when the snapshot is unknown.
        self._workspace_baseline_snapshot_locked(record)
        self._load_workspace_snapshot_locked(workspace_id, target)
        self._delete_workspace_snapshot_locked(workspace_id, target)

    outcome: dict[str, Any] = {"workspace_id": workspace_id}
    outcome["snapshot_name"] = target
    outcome["deleted"] = True
    return outcome
def reset_workspace(
self,
workspace_id: str,
*,
snapshot: str = "baseline",
) -> dict[str, Any]:
# Tear down the workspace sandbox and recreate it from a snapshot archive.
#
# `snapshot` names the snapshot to restore and defaults to "baseline".
# Returns the serialized workspace with an added "workspace_reset" entry that
# describes the restored snapshot and the archive import summary.
# Any exception raised while recreating the sandbox is re-raised after
# best-effort cleanup.
#
# NOTE(review): indentation is not preserved in this view, so the exact extent of
# each `with self._lock:` block below cannot be confirmed from here.
with self._lock:
workspace = self._load_workspace_locked(workspace_id)
self._ensure_workspace_not_expired_locked(workspace, time.time())
self._refresh_workspace_liveness_locked(workspace)
# Resolve the snapshot before anything is torn down, so an unknown or invalid
# snapshot name fails while the current sandbox is still intact.
selected_snapshot, archive_path = self._resolve_workspace_snapshot_locked(
workspace,
snapshot,
)
instance = workspace.to_instance(
workdir=self._workspace_runtime_dir(workspace.workspace_id)
)
# Teardown order: services, then shells, then the backend instance itself.
self._stop_workspace_services_locked(workspace, instance)
self._close_workspace_shells_locked(workspace, instance)
if workspace.state == "started":
self._backend.stop(instance)
workspace.state = "stopped"
self._backend.delete(instance)
workspace.state = "stopped"
workspace.firecracker_pid = None
workspace.last_error = None
# Clears the runtime, host, commands, shells and services directories.
self._reset_workspace_runtime_dirs(workspace_id)
self._save_workspace_locked(workspace)
# Recreate phase: `recreated` stays None until an instance object exists, so the
# failure handler knows whether there is anything to stop or delete.
recreated: VmInstance | None = None
try:
recreated = workspace.to_instance(
workdir=self._workspace_runtime_dir(workspace.workspace_id)
)
self._backend.create(recreated)
if self._runtime_capabilities.supports_guest_exec:
self._ensure_workspace_guest_agent_support(recreated)
with self._lock:
self._start_instance_locked(recreated)
self._require_guest_exec_or_opt_in(recreated)
# Restore the selected snapshot archive into the fresh guest workspace.
reset_summary = self._backend.import_archive(
recreated,
archive_path=archive_path,
destination=WORKSPACE_GUEST_PATH,
)
# NOTE(review): the "_locked" helpers below are expected to run with the lock
# held; whether they sit inside the `with self._lock:` above cannot be told
# from this flattened view. Confirm.
workspace = self._load_workspace_locked(workspace_id)
workspace.state = recreated.state
workspace.firecracker_pid = recreated.firecracker_pid
workspace.last_error = recreated.last_error
workspace.metadata = dict(recreated.metadata)
# Command history is cleared and the reset bookkeeping is advanced; the
# workspace id itself is not changed.
workspace.command_count = 0
workspace.last_command = None
workspace.reset_count += 1
workspace.last_reset_at = time.time()
self._save_workspace_locked(workspace)
payload = self._serialize_workspace(workspace)
except Exception:
# Best-effort cleanup: stop and delete failures are deliberately ignored so
# the original exception is the one that propagates.
try:
if recreated is not None and recreated.state == "started":
self._backend.stop(recreated)
except Exception:
pass
try:
if recreated is not None:
self._backend.delete(recreated)
except Exception:
pass
with self._lock:
workspace = self._load_workspace_locked(workspace_id)
workspace.state = "stopped"
workspace.firecracker_pid = None
# NOTE(review): last_error is cleared rather than set to the failure that is
# about to be re-raised, so the persisted record keeps no trace of it.
workspace.last_error = None
self._save_workspace_locked(workspace)
raise
payload["workspace_reset"] = {
"snapshot_name": selected_snapshot.snapshot_name,
"kind": selected_snapshot.kind,
"destination": str(reset_summary["destination"]),
"entry_count": int(reset_summary["entry_count"]),
"bytes_written": int(reset_summary["bytes_written"]),
}
return payload
def exec_workspace(
self,
workspace_id: str,
@ -3372,6 +3627,8 @@ class VmManager:
"workspace_seed": _workspace_seed_dict(workspace.workspace_seed),
"command_count": workspace.command_count,
"last_command": workspace.last_command,
"reset_count": workspace.reset_count,
"last_reset_at": workspace.last_reset_at,
"service_count": service_count,
"running_service_count": running_service_count,
"metadata": workspace.metadata,
@ -3408,6 +3665,17 @@ class VmManager:
"stop_reason": service.stop_reason,
}
def _serialize_workspace_snapshot(self, snapshot: WorkspaceSnapshotRecord) -> dict[str, Any]:
    """Return the API-facing dict for a snapshot record.

    Copies the record's fields and adds ``deletable``, which is false only for
    the baseline snapshot.
    """
    copied_fields = (
        "workspace_id",
        "snapshot_name",
        "kind",
        "created_at",
        "entry_count",
        "bytes_written",
    )
    serialized: dict[str, Any] = {name: getattr(snapshot, name) for name in copied_fields}
    serialized["deletable"] = snapshot.kind != "baseline"
    return serialized
def _require_guest_boot_or_opt_in(self, instance: VmInstance) -> None:
if self._runtime_capabilities.supports_vm_boot or instance.allow_host_compat:
return
@ -3589,6 +3857,15 @@ class VmManager:
def _workspace_baseline_archive_path(self, workspace_id: str) -> Path:
    """Return the baseline archive path inside the workspace's baseline directory."""
    baseline_dir = self._workspace_baseline_dir(workspace_id)
    return baseline_dir / WORKSPACE_BASELINE_ARCHIVE_NAME
def _workspace_snapshots_dir(self, workspace_id: str) -> Path:
    """Return the directory that holds a workspace's named snapshots."""
    workspace_root = self._workspace_dir(workspace_id)
    return workspace_root / WORKSPACE_SNAPSHOTS_DIRNAME
def _workspace_snapshot_archive_path(self, workspace_id: str, snapshot_name: str) -> Path:
    """Return the ``<snapshot_name>.tar`` path inside the workspace's snapshots directory."""
    snapshots_root = self._workspace_snapshots_dir(workspace_id)
    archive_filename = f"{snapshot_name}.tar"
    return snapshots_root / archive_filename
def _workspace_snapshot_metadata_path(self, workspace_id: str, snapshot_name: str) -> Path:
    """Return the ``<snapshot_name>.json`` path inside the workspace's snapshots directory."""
    snapshots_root = self._workspace_snapshots_dir(workspace_id)
    metadata_filename = f"{snapshot_name}.json"
    return snapshots_root / metadata_filename
def _workspace_commands_dir(self, workspace_id: str) -> Path:
    """Return the commands directory located under the workspace directory."""
    workspace_root = self._workspace_dir(workspace_id)
    return workspace_root / WORKSPACE_COMMANDS_DIRNAME
@ -3846,6 +4123,41 @@ class VmManager:
services = self._list_workspace_services_locked(workspace_id)
return len(services), sum(1 for service in services if service.state == "running")
def _workspace_baseline_snapshot_locked(
    self,
    workspace: WorkspaceRecord,
) -> WorkspaceSnapshotRecord:
    """Build the synthetic ``baseline`` snapshot record for a workspace.

    The record is derived on every call by inspecting the baseline archive; its
    ``created_at`` is the workspace's own creation time.

    Raises ``RuntimeError`` when the workspace has no baseline archive on disk.
    """
    owner_id = workspace.workspace_id
    archive = self._workspace_baseline_archive_path(owner_id)
    if archive.exists():
        entries, size = _inspect_seed_archive(archive)
        return WorkspaceSnapshotRecord(
            workspace_id=owner_id,
            snapshot_name="baseline",
            kind="baseline",
            created_at=workspace.created_at,
            entry_count=entries,
            bytes_written=size,
        )
    raise RuntimeError(
        "workspace snapshots and reset require a baseline snapshot. "
        "Recreate the workspace to use snapshot/reset features."
    )
def _resolve_workspace_snapshot_locked(
    self,
    workspace: WorkspaceRecord,
    snapshot_name: str,
) -> tuple[WorkspaceSnapshotRecord, Path]:
    """Resolve a snapshot name to its record and the archive that backs it.

    ``"baseline"`` resolves to the synthetic baseline record and the baseline
    archive; any other name resolves to the stored named snapshot.

    Raises:
        ValueError: the name is invalid or no such named snapshot exists.
        RuntimeError: the baseline archive is missing, or a named snapshot's
            metadata exists but its archive file does not.
    """
    normalized_name = _normalize_workspace_snapshot_name(snapshot_name, allow_baseline=True)
    if normalized_name == "baseline":
        baseline = self._workspace_baseline_snapshot_locked(workspace)
        return baseline, self._workspace_baseline_archive_path(workspace.workspace_id)

    snapshot = self._load_workspace_snapshot_locked(workspace.workspace_id, normalized_name)
    archive_path = self._workspace_snapshot_archive_path(
        workspace.workspace_id,
        normalized_name,
    )
    # Fail here, before the caller acts on the result: reset_workspace stops and
    # deletes the running sandbox right after resolving, so a snapshot whose
    # archive has gone missing must be rejected while the sandbox is still intact.
    if not archive_path.exists():
        raise RuntimeError(
            f"snapshot {normalized_name!r} in workspace {workspace.workspace_id!r} "
            f"is missing its archive at {archive_path}"
        )
    return snapshot, archive_path
def _load_workspace_service_locked(
self,
workspace_id: str,
@ -3861,6 +4173,34 @@ class VmManager:
raise RuntimeError(f"service record at {record_path} is invalid")
return WorkspaceServiceRecord.from_payload(payload)
def _load_workspace_snapshot_locked(
    self,
    workspace_id: str,
    snapshot_name: str,
) -> WorkspaceSnapshotRecord:
    """Load a named snapshot's metadata record, requiring that it exists.

    Raises:
        ValueError: no metadata file exists for ``snapshot_name``.
        RuntimeError: the metadata file does not contain a JSON object.
    """
    # Reading and validation live in the optional loader; this variant only
    # turns "not found" into an error so the two cannot drift apart.
    snapshot = self._load_workspace_snapshot_locked_optional(workspace_id, snapshot_name)
    if snapshot is None:
        raise ValueError(
            f"snapshot {snapshot_name!r} does not exist in workspace {workspace_id!r}"
        )
    return snapshot
def _load_workspace_snapshot_locked_optional(
    self,
    workspace_id: str,
    snapshot_name: str,
) -> WorkspaceSnapshotRecord | None:
    """Load a named snapshot's metadata record, or return ``None`` if absent.

    Raises:
        RuntimeError: the metadata file does not contain a JSON object.
    """
    record_path = self._workspace_snapshot_metadata_path(workspace_id, snapshot_name)
    try:
        # Read directly instead of probing with exists() first: a file removed
        # between a probe and the read would otherwise surface as an unexpected
        # FileNotFoundError instead of "no such snapshot".
        raw_record = record_path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        return None
    payload = json.loads(raw_record)
    if not isinstance(payload, dict):
        raise RuntimeError(f"snapshot record at {record_path} is invalid")
    return WorkspaceSnapshotRecord.from_payload(payload)
def _load_workspace_service_locked_optional(
self,
workspace_id: str,
@ -3885,6 +4225,17 @@ class VmManager:
encoding="utf-8",
)
def _save_workspace_snapshot_locked(self, snapshot: WorkspaceSnapshotRecord) -> None:
    """Write a snapshot's metadata as pretty-printed JSON, creating parent directories."""
    destination = self._workspace_snapshot_metadata_path(
        snapshot.workspace_id,
        snapshot.snapshot_name,
    )
    destination.parent.mkdir(parents=True, exist_ok=True)
    encoded = json.dumps(snapshot.to_payload(), indent=2, sort_keys=True)
    destination.write_text(encoded, encoding="utf-8")
def _delete_workspace_service_artifacts_locked(
self,
workspace_id: str,
@ -3897,6 +4248,10 @@ class VmManager:
_workspace_service_status_path(services_dir, service_name).unlink(missing_ok=True)
_workspace_service_runner_path(services_dir, service_name).unlink(missing_ok=True)
def _delete_workspace_snapshot_locked(self, workspace_id: str, snapshot_name: str) -> None:
    """Remove a snapshot's metadata file and then its archive, ignoring missing files."""
    # Metadata goes first: its presence is what marks a snapshot as existing.
    metadata_file = self._workspace_snapshot_metadata_path(workspace_id, snapshot_name)
    metadata_file.unlink(missing_ok=True)
    archive_file = self._workspace_snapshot_archive_path(workspace_id, snapshot_name)
    archive_file.unlink(missing_ok=True)
def _list_workspace_services_locked(self, workspace_id: str) -> list[WorkspaceServiceRecord]:
services_dir = self._workspace_services_dir(workspace_id)
if not services_dir.exists():
@ -3909,6 +4264,26 @@ class VmManager:
services.append(WorkspaceServiceRecord.from_payload(payload))
return services
def _list_workspace_snapshots_locked(
    self,
    workspace: WorkspaceRecord,
) -> list[WorkspaceSnapshotRecord]:
    """Return the baseline snapshot followed by all named snapshots.

    Named snapshots are ordered newest first, with ties broken by name. Metadata
    files whose JSON content is not an object are skipped.
    """
    snapshots_root = self._workspace_snapshots_dir(workspace.workspace_id)
    baseline = self._workspace_baseline_snapshot_locked(workspace)
    if not snapshots_root.exists():
        return [baseline]

    # The generator is consumed lazily, so each file is read and converted
    # before the next one is opened.
    decoded_payloads = (
        json.loads(metadata_file.read_text(encoding="utf-8"))
        for metadata_file in snapshots_root.glob("*.json")
    )
    named = [
        WorkspaceSnapshotRecord.from_payload(decoded)
        for decoded in decoded_payloads
        if isinstance(decoded, dict)
    ]
    newest_first = sorted(named, key=lambda record: (-record.created_at, record.snapshot_name))
    return [baseline, *newest_first]
def _save_workspace_shell_locked(self, shell: WorkspaceShellRecord) -> None:
record_path = self._workspace_shell_record_path(shell.workspace_id, shell.shell_id)
record_path.parent.mkdir(parents=True, exist_ok=True)
@ -3950,6 +4325,17 @@ class VmManager:
pass
self._delete_workspace_shell_locked(workspace.workspace_id, shell.shell_id)
def _reset_workspace_runtime_dirs(self, workspace_id: str) -> None:
    """Wipe a workspace's runtime state directories and recreate the empty ones.

    The runtime, host, commands, shells and services directories are removed
    (errors ignored); all but the runtime directory are then recreated empty.
    """
    recreated_dir_locators = (
        self._workspace_host_dir,
        self._workspace_commands_dir,
        self._workspace_shells_dir,
        self._workspace_services_dir,
    )
    # Every removal happens before any directory is recreated.
    for locate in (self._workspace_runtime_dir, *recreated_dir_locators):
        shutil.rmtree(locate(workspace_id), ignore_errors=True)
    for locate in recreated_dir_locators:
        locate(workspace_id).mkdir(parents=True, exist_ok=True)
def _refresh_workspace_service_locked(
self,
workspace: WorkspaceRecord,