Add stopped-workspace disk export and inspection

Finish the 3.1.0 secondary disk-tools milestone so stable workspaces can be
stopped, inspected offline, exported as raw ext4 images, and started again
without changing the primary workspace-first interaction model.

Add workspace stop/start plus workspace disk export/list/read across the CLI,
SDK, and MCP, backed by a new offline debugfs inspection helper and guest-only
validation. Scrub runtime-only guest state before disk inspection/export, and
fix the real guest reliability gaps by flushing the filesystem on stop and
removing stale Firecracker socket files before restart.

Update the docs, examples, changelog, and roadmap to mark 3.1.0 done, and
cover the new lifecycle/disk paths with API, CLI, manager, contract, and
package-surface tests.

Validation: uv lock; UV_CACHE_DIR=.uv-cache make check; UV_CACHE_DIR=.uv-cache
make dist-check; real guest-backed smoke for create, shell/service activity,
stop, workspace disk list/read/export, start, exec, and delete.
This commit is contained in:
Thales Maciel 2026-03-12 20:57:16 -03:00
parent f2d20ef30a
commit 287f6d100f
26 changed files with 2585 additions and 34 deletions

View file

@ -34,6 +34,12 @@ from pyro_mcp.vm_environments import EnvironmentStore, default_cache_dir, get_en
from pyro_mcp.vm_firecracker import build_launch_plan
from pyro_mcp.vm_guest import VsockExecClient
from pyro_mcp.vm_network import NetworkConfig, TapNetworkManager
from pyro_mcp.workspace_disk import (
export_workspace_disk_image,
list_workspace_disk,
read_workspace_disk_file,
scrub_workspace_runtime_paths,
)
from pyro_mcp.workspace_ports import DEFAULT_PUBLISHED_PORT_HOST
from pyro_mcp.workspace_shells import (
create_local_shell,
@ -72,6 +78,7 @@ WORKSPACE_SECRET_MAX_BYTES = 64 * 1024
# Default pseudo-terminal geometry for interactive workspace shells.
DEFAULT_SHELL_COLS = 120
DEFAULT_SHELL_ROWS = 30
# Cap on characters returned from a single shell output read (64 Ki chars).
DEFAULT_SHELL_MAX_CHARS = 65536
# Byte cap for offline workspace-disk file reads (64 KiB); callers may override per call.
DEFAULT_WORKSPACE_DISK_READ_MAX_BYTES = 65536
# Service readiness polling: overall timeout and poll interval.
DEFAULT_SERVICE_READY_TIMEOUT_SECONDS = 30
DEFAULT_SERVICE_READY_INTERVAL_MS = 500
# Number of trailing log lines returned when tailing a service log.
DEFAULT_SERVICE_LOG_TAIL_LINES = 200
@ -789,6 +796,28 @@ def _workspace_host_destination(workspace_dir: Path, destination: str) -> Path:
return workspace_dir.joinpath(*suffix.parts)
def _normalize_workspace_disk_path(path: str) -> str:
candidate = path.strip()
if candidate == "":
raise ValueError("workspace disk path must not be empty")
if candidate.startswith("/"):
raw_path = PurePosixPath(candidate)
normalized_parts: list[str] = []
for part in raw_path.parts:
if part in {"", "/", "."}:
continue
if part == "..":
if normalized_parts:
normalized_parts.pop()
continue
normalized_parts.append(part)
if not normalized_parts:
return "/"
return str(PurePosixPath("/") / PurePosixPath(*normalized_parts))
normalized, _ = _normalize_workspace_destination(candidate)
return normalized
def _normalize_archive_member_name(name: str) -> PurePosixPath:
candidate = name.strip()
if candidate == "":
@ -2480,6 +2509,11 @@ class FirecrackerBackend(VmBackend): # pragma: no cover
def start(self, instance: VmInstance) -> None:
    # Launch a Firecracker microVM for *instance*.
    # NOTE: only the opening portion of this method is visible in this hunk;
    # the remainder (process launch, PID capture, etc.) follows elsewhere.
    launch_plan = build_launch_plan(instance)
    # Remove stale Unix-socket files left behind by a previous run: Firecracker
    # refuses to bind its API socket (and the vsock UDS) if the path exists.
    for stale_socket_path in (
        launch_plan.api_socket_path,
        instance.workdir / "vsock.sock",
    ):
        stale_socket_path.unlink(missing_ok=True)
    # Record the launch-plan paths so later operations (and restarts) can
    # find the config and guest helper scripts without rebuilding the plan.
    instance.metadata["firecracker_config_path"] = str(launch_plan.config_path)
    instance.metadata["guest_network_path"] = str(launch_plan.guest_network_path)
    instance.metadata["guest_exec_path"] = str(launch_plan.guest_exec_path)
@ -4309,6 +4343,159 @@ class VmManager:
"entries": redacted_entries,
}
def stop_workspace(self, workspace_id: str) -> dict[str, Any]:
    """Stop a workspace VM, persisting a consistent 'stopped' record.

    Services and shells are torn down first, the guest filesystem is
    flushed, then the backend VM is stopped and runtime-only guest state
    is scrubbed from the disk. On failure the workspace is still marked
    stopped with ``last_error`` recorded, and the exception re-raised.

    NOTE(review): indentation was lost in this diff view; the structure
    below (everything under one lock, try/except around teardown) is
    reconstructed from the _locked naming convention — confirm against
    the original file.
    """
    with self._lock:
        workspace = self._load_workspace_locked(workspace_id)
        self._ensure_workspace_not_expired_locked(workspace, time.time())
        self._refresh_workspace_liveness_locked(workspace)
        instance = workspace.to_instance(
            workdir=self._workspace_runtime_dir(workspace.workspace_id)
        )
        try:
            self._stop_workspace_services_locked(workspace, instance)
            self._close_workspace_shells_locked(workspace, instance)
            # Flush guest filesystem before power-off so the rootfs image
            # is consistent for later offline inspection/export.
            self._flush_workspace_filesystem_locked(workspace, instance)
            if workspace.state == "started":
                self._backend.stop(instance)
            workspace.state = "stopped"
            workspace.firecracker_pid = None
            workspace.last_error = None
            workspace.metadata = dict(instance.metadata)
            # Remove runtime-only guest state (sockets, tmp files) from the
            # now-quiescent disk image.
            self._scrub_workspace_runtime_state_locked(workspace)
        except Exception as exc:
            # Even on failure, record the workspace as stopped with the
            # error preserved, then propagate to the caller.
            workspace.state = "stopped"
            workspace.firecracker_pid = None
            workspace.last_error = str(exc)
            workspace.metadata = dict(instance.metadata)
            self._save_workspace_locked(workspace)
            raise
        self._save_workspace_locked(workspace)
        return self._serialize_workspace(workspace)
def start_workspace(self, workspace_id: str) -> dict[str, Any]:
    """(Re)start a stopped workspace VM and return its serialized record.

    Starting an already-started workspace is a no-op beyond refreshing
    service counts. On any failure the VM is best-effort stopped and the
    workspace persisted as stopped with ``last_error`` set.

    NOTE(review): indentation was lost in this diff view; the lock is
    re-acquired inside the try block (see the inner ``with self._lock:``),
    so the capability checks between the two lock scopes are presumably
    performed unlocked — confirm against the original file.
    """
    with self._lock:
        workspace = self._load_workspace_locked(workspace_id)
        self._ensure_workspace_not_expired_locked(workspace, time.time())
        self._refresh_workspace_liveness_locked(workspace)
        if workspace.state == "started":
            # Already running: just refresh derived state and return.
            self._refresh_workspace_service_counts_locked(workspace)
            self._save_workspace_locked(workspace)
            return self._serialize_workspace(workspace)
        instance = workspace.to_instance(
            workdir=self._workspace_runtime_dir(workspace.workspace_id)
        )
        # Clear any stale service/shell records from the previous run.
        self._stop_workspace_services_locked(workspace, instance)
        self._close_workspace_shells_locked(workspace, instance)
    try:
        # Validate runtime support before attempting the launch.
        self._require_workspace_network_policy_support(
            network_policy=workspace.network_policy
        )
        if self._runtime_capabilities.supports_guest_exec:
            self._ensure_workspace_guest_bootstrap_support(instance)
        with self._lock:
            self._start_instance_locked(instance)
            # Reload: the record may have changed while unlocked.
            workspace = self._load_workspace_locked(workspace_id)
            if workspace.secrets:
                # Secrets live only in the guest; reinstall after each boot.
                self._install_workspace_secrets_locked(workspace, instance)
            workspace.state = instance.state
            workspace.firecracker_pid = instance.firecracker_pid
            workspace.last_error = None
            workspace.metadata = dict(instance.metadata)
            self._save_workspace_locked(workspace)
            return self._serialize_workspace(workspace)
    except Exception as exc:
        # Best-effort rollback: stop a half-started VM, ignoring errors.
        try:
            if instance.state == "started":
                self._backend.stop(instance)
        except Exception:
            pass
        with self._lock:
            workspace = self._load_workspace_locked(workspace_id)
            workspace.state = "stopped"
            workspace.firecracker_pid = None
            workspace.last_error = str(exc)
            workspace.metadata = dict(instance.metadata)
            self._save_workspace_locked(workspace)
        raise
def export_workspace_disk(
    self,
    workspace_id: str,
    *,
    output_path: str | Path,
) -> dict[str, Any]:
    """Export a stopped workspace's rootfs to *output_path* as a raw image.

    The workspace must already be stopped (enforced by the stopped-disk
    guard); runtime-only guest state is scrubbed from the image before it
    is copied out.

    Returns a summary dict with the resolved output path, disk format,
    and number of bytes written.

    Raises:
        ValueError: if *output_path* is empty or whitespace-only.
    """
    if not str(output_path).strip():
        raise ValueError("output_path must not be empty")
    destination = Path(output_path).expanduser().resolve()
    with self._lock:
        workspace = self._load_workspace_locked(workspace_id)
        rootfs_path = self._workspace_stopped_disk_rootfs_locked(
            workspace,
            operation_name="workspace_disk_export",
        )
        # Drop runtime-only guest state so the export is reproducible.
        self._scrub_workspace_runtime_state_locked(workspace, rootfs_path=rootfs_path)
        self._save_workspace_locked(workspace)
        exported = export_workspace_disk_image(rootfs_path, output_path=destination)
        return {
            "workspace_id": workspace_id,
            "output_path": str(Path(str(exported["output_path"]))),
            "disk_format": str(exported["disk_format"]),
            "bytes_written": int(exported["bytes_written"]),
        }
def list_workspace_disk(
    self,
    workspace_id: str,
    *,
    path: str = WORKSPACE_GUEST_PATH,
    recursive: bool = False,
) -> dict[str, Any]:
    """List directory entries from a stopped workspace's disk image.

    *path* is normalized with the shared disk-path rules; the workspace
    must be stopped, and runtime-only guest state is scrubbed before the
    offline listing is taken.
    """
    guest_path = _normalize_workspace_disk_path(path)
    with self._lock:
        workspace = self._load_workspace_locked(workspace_id)
        rootfs_path = self._workspace_stopped_disk_rootfs_locked(
            workspace,
            operation_name="workspace_disk_list",
        )
        self._scrub_workspace_runtime_state_locked(workspace, rootfs_path=rootfs_path)
        self._save_workspace_locked(workspace)
        # Module-level helper of the same name performs the offline read.
        entries = list_workspace_disk(
            rootfs_path,
            guest_path=guest_path,
            recursive=recursive,
        )
        return {
            "workspace_id": workspace_id,
            "path": guest_path,
            "recursive": recursive,
            "entries": entries,
        }
def read_workspace_disk(
    self,
    workspace_id: str,
    *,
    path: str,
    max_bytes: int = DEFAULT_WORKSPACE_DISK_READ_MAX_BYTES,
) -> dict[str, Any]:
    """Read one file from a stopped workspace's disk image.

    *path* is normalized with the shared disk-path rules and the read is
    capped at *max_bytes*. The returned payload from the offline reader
    is augmented with the workspace id.
    """
    guest_path = _normalize_workspace_disk_path(path)
    with self._lock:
        workspace = self._load_workspace_locked(workspace_id)
        rootfs_path = self._workspace_stopped_disk_rootfs_locked(
            workspace,
            operation_name="workspace_disk_read",
        )
        self._scrub_workspace_runtime_state_locked(workspace, rootfs_path=rootfs_path)
        self._save_workspace_locked(workspace)
        result = read_workspace_disk_file(
            rootfs_path,
            guest_path=guest_path,
            max_bytes=max_bytes,
        )
        result["workspace_id"] = workspace_id
        return result
def delete_workspace(
self,
workspace_id: str,
@ -4748,6 +4935,67 @@ class VmManager:
def _workspace_service_record_path(self, workspace_id: str, service_name: str) -> Path:
return self._workspace_services_dir(workspace_id) / f"{service_name}.json"
def _workspace_rootfs_image_path_locked(
self,
workspace: WorkspaceRecord,
) -> Path:
raw_rootfs_image = workspace.metadata.get("rootfs_image")
if raw_rootfs_image is None or raw_rootfs_image == "":
raise RuntimeError(
f"workspace {workspace.workspace_id!r} does not have a persisted rootfs image"
)
rootfs_path = Path(raw_rootfs_image)
if not rootfs_path.exists():
raise RuntimeError(
f"workspace {workspace.workspace_id!r} rootfs image is unavailable at "
f"{rootfs_path}"
)
return rootfs_path
def _workspace_stopped_disk_rootfs_locked(
self,
workspace: WorkspaceRecord,
*,
operation_name: str,
) -> Path:
self._ensure_workspace_not_expired_locked(workspace, time.time())
self._refresh_workspace_liveness_locked(workspace)
if workspace.state != "stopped":
raise RuntimeError(
f"workspace {workspace.workspace_id!r} must be stopped before {operation_name}"
)
if workspace.metadata.get("execution_mode") == "host_compat":
raise RuntimeError(
f"{operation_name} is unavailable for host_compat workspaces"
)
return self._workspace_rootfs_image_path_locked(workspace)
def _scrub_workspace_runtime_state_locked(
    self,
    workspace: WorkspaceRecord,
    *,
    rootfs_path: Path | None = None,
) -> None:
    """Remove runtime-only guest state from the workspace rootfs image.

    No-op for host_compat workspaces (no guest disk image). When
    *rootfs_path* is not supplied, the recorded rootfs image path is
    resolved from the workspace metadata.
    """
    if workspace.metadata.get("execution_mode") == "host_compat":
        return
    if rootfs_path is not None:
        target = rootfs_path
    else:
        target = self._workspace_rootfs_image_path_locked(workspace)
    scrub_workspace_runtime_paths(target)
def _flush_workspace_filesystem_locked(
self,
workspace: WorkspaceRecord,
instance: VmInstance,
) -> None:
if workspace.state != "started":
return
if self._backend_name == "mock":
return
if not self._runtime_capabilities.supports_guest_exec:
return
self._backend.exec(instance, "sync", 10)
def _count_workspaces_locked(self) -> int:
return sum(1 for _ in self._workspaces_dir.glob("*/workspace.json"))