Add stopped-workspace disk export and inspection
Finish the 3.1.0 secondary disk-tools milestone so stable workspaces can be stopped, inspected offline, exported as raw ext4 images, and started again without changing the primary workspace-first interaction model. Add workspace stop/start plus workspace disk export/list/read across the CLI, SDK, and MCP, backed by a new offline debugfs inspection helper and guest-only validation. Scrub runtime-only guest state before disk inspection/export, and fix the real guest reliability gaps by flushing the filesystem on stop and removing stale Firecracker socket files before restart. Update the docs, examples, changelog, and roadmap to mark 3.1.0 done, and cover the new lifecycle/disk paths with API, CLI, manager, contract, and package-surface tests. Validation: uv lock; UV_CACHE_DIR=.uv-cache make check; UV_CACHE_DIR=.uv-cache make dist-check; real guest-backed smoke for create, shell/service activity, stop, workspace disk list/read/export, start, exec, and delete.
This commit is contained in:
parent
f2d20ef30a
commit
287f6d100f
26 changed files with 2585 additions and 34 deletions
|
|
@ -34,6 +34,12 @@ from pyro_mcp.vm_environments import EnvironmentStore, default_cache_dir, get_en
|
|||
from pyro_mcp.vm_firecracker import build_launch_plan
|
||||
from pyro_mcp.vm_guest import VsockExecClient
|
||||
from pyro_mcp.vm_network import NetworkConfig, TapNetworkManager
|
||||
from pyro_mcp.workspace_disk import (
|
||||
export_workspace_disk_image,
|
||||
list_workspace_disk,
|
||||
read_workspace_disk_file,
|
||||
scrub_workspace_runtime_paths,
|
||||
)
|
||||
from pyro_mcp.workspace_ports import DEFAULT_PUBLISHED_PORT_HOST
|
||||
from pyro_mcp.workspace_shells import (
|
||||
create_local_shell,
|
||||
|
|
@ -72,6 +78,7 @@ WORKSPACE_SECRET_MAX_BYTES = 64 * 1024
|
|||
# Default pseudo-terminal geometry for workspace shells.
DEFAULT_SHELL_COLS = 120
DEFAULT_SHELL_ROWS = 30
# Cap on characters returned from a single shell read.
DEFAULT_SHELL_MAX_CHARS = 65536
# Cap on bytes returned by one offline workspace-disk file read.
DEFAULT_WORKSPACE_DISK_READ_MAX_BYTES = 65536
# Service readiness polling: overall timeout and poll interval.
DEFAULT_SERVICE_READY_TIMEOUT_SECONDS = 30
DEFAULT_SERVICE_READY_INTERVAL_MS = 500
# Default number of trailing log lines returned for a service.
DEFAULT_SERVICE_LOG_TAIL_LINES = 200
|
||||
|
|
@ -789,6 +796,28 @@ def _workspace_host_destination(workspace_dir: Path, destination: str) -> Path:
|
|||
return workspace_dir.joinpath(*suffix.parts)
|
||||
|
||||
|
||||
def _normalize_workspace_disk_path(path: str) -> str:
|
||||
candidate = path.strip()
|
||||
if candidate == "":
|
||||
raise ValueError("workspace disk path must not be empty")
|
||||
if candidate.startswith("/"):
|
||||
raw_path = PurePosixPath(candidate)
|
||||
normalized_parts: list[str] = []
|
||||
for part in raw_path.parts:
|
||||
if part in {"", "/", "."}:
|
||||
continue
|
||||
if part == "..":
|
||||
if normalized_parts:
|
||||
normalized_parts.pop()
|
||||
continue
|
||||
normalized_parts.append(part)
|
||||
if not normalized_parts:
|
||||
return "/"
|
||||
return str(PurePosixPath("/") / PurePosixPath(*normalized_parts))
|
||||
normalized, _ = _normalize_workspace_destination(candidate)
|
||||
return normalized
|
||||
|
||||
|
||||
def _normalize_archive_member_name(name: str) -> PurePosixPath:
|
||||
candidate = name.strip()
|
||||
if candidate == "":
|
||||
|
|
@ -2480,6 +2509,11 @@ class FirecrackerBackend(VmBackend): # pragma: no cover
|
|||
|
||||
def start(self, instance: VmInstance) -> None:
|
||||
launch_plan = build_launch_plan(instance)
|
||||
for stale_socket_path in (
|
||||
launch_plan.api_socket_path,
|
||||
instance.workdir / "vsock.sock",
|
||||
):
|
||||
stale_socket_path.unlink(missing_ok=True)
|
||||
instance.metadata["firecracker_config_path"] = str(launch_plan.config_path)
|
||||
instance.metadata["guest_network_path"] = str(launch_plan.guest_network_path)
|
||||
instance.metadata["guest_exec_path"] = str(launch_plan.guest_exec_path)
|
||||
|
|
@ -4309,6 +4343,159 @@ class VmManager:
|
|||
"entries": redacted_entries,
|
||||
}
|
||||
|
||||
def stop_workspace(self, workspace_id: str) -> dict[str, Any]:
    """Stop a workspace and persist it in the ``stopped`` state.

    Services and shells are torn down first, the guest filesystem is
    flushed so the rootfs image is consistent for offline inspection or
    export, and runtime-only guest state is scrubbed from the image.

    Returns the serialized workspace record.  On failure the workspace is
    still recorded as stopped with ``last_error`` set, then the exception
    is re-raised.
    """
    with self._lock:
        workspace = self._load_workspace_locked(workspace_id)
        self._ensure_workspace_not_expired_locked(workspace, time.time())
        self._refresh_workspace_liveness_locked(workspace)
        instance = workspace.to_instance(
            workdir=self._workspace_runtime_dir(workspace.workspace_id)
        )
        try:
            # Tear down per-workspace services and shells before the VM itself.
            self._stop_workspace_services_locked(workspace, instance)
            self._close_workspace_shells_locked(workspace, instance)
            # Sync the guest filesystem so the on-disk image is clean.
            self._flush_workspace_filesystem_locked(workspace, instance)
            if workspace.state == "started":
                self._backend.stop(instance)
            workspace.state = "stopped"
            workspace.firecracker_pid = None
            workspace.last_error = None
            workspace.metadata = dict(instance.metadata)
            # Remove runtime-only guest state from the stopped image.
            self._scrub_workspace_runtime_state_locked(workspace)
        except Exception as exc:
            # Best effort: record the stop attempt (state + error) so the
            # persisted record reflects reality, then surface the failure.
            workspace.state = "stopped"
            workspace.firecracker_pid = None
            workspace.last_error = str(exc)
            workspace.metadata = dict(instance.metadata)
            self._save_workspace_locked(workspace)
            raise
        self._save_workspace_locked(workspace)
        return self._serialize_workspace(workspace)
|
||||
|
||||
def start_workspace(self, workspace_id: str) -> dict[str, Any]:
    """Start a stopped workspace; refresh and return it if already started.

    Pre-start validation (network policy, guest bootstrap support) runs
    before the instance is launched.  On any failure the backend is
    stopped best-effort and the workspace is persisted as ``stopped``
    with ``last_error`` set before the exception is re-raised.

    NOTE(review): the lock nesting here is reconstructed — the capability
    checks appear to run outside the state lock, which is then re-acquired
    for the actual launch; confirm against the original indentation.
    """
    with self._lock:
        workspace = self._load_workspace_locked(workspace_id)
        self._ensure_workspace_not_expired_locked(workspace, time.time())
        self._refresh_workspace_liveness_locked(workspace)
        if workspace.state == "started":
            # Already running: refresh service counts and return as-is.
            self._refresh_workspace_service_counts_locked(workspace)
            self._save_workspace_locked(workspace)
            return self._serialize_workspace(workspace)
        instance = workspace.to_instance(
            workdir=self._workspace_runtime_dir(workspace.workspace_id)
        )
        # Clear any stale service/shell records from a previous run.
        self._stop_workspace_services_locked(workspace, instance)
        self._close_workspace_shells_locked(workspace, instance)
    try:
        self._require_workspace_network_policy_support(
            network_policy=workspace.network_policy
        )
        if self._runtime_capabilities.supports_guest_exec:
            self._ensure_workspace_guest_bootstrap_support(instance)
        with self._lock:
            self._start_instance_locked(instance)
            # Reload the record in case it changed while unlocked.
            workspace = self._load_workspace_locked(workspace_id)
            if workspace.secrets:
                self._install_workspace_secrets_locked(workspace, instance)
            workspace.state = instance.state
            workspace.firecracker_pid = instance.firecracker_pid
            workspace.last_error = None
            workspace.metadata = dict(instance.metadata)
            self._save_workspace_locked(workspace)
            return self._serialize_workspace(workspace)
    except Exception as exc:
        try:
            # Best-effort teardown of a partially started instance.
            if instance.state == "started":
                self._backend.stop(instance)
        except Exception:
            # Deliberately swallowed: the original failure is what matters.
            pass
        with self._lock:
            workspace = self._load_workspace_locked(workspace_id)
            workspace.state = "stopped"
            workspace.firecracker_pid = None
            workspace.last_error = str(exc)
            workspace.metadata = dict(instance.metadata)
            self._save_workspace_locked(workspace)
        raise
|
||||
|
||||
def export_workspace_disk(
    self,
    workspace_id: str,
    *,
    output_path: str | Path,
) -> dict[str, Any]:
    """Export a stopped workspace's rootfs image to *output_path*.

    The workspace must be stopped and guest-backed (validated by
    ``_workspace_stopped_disk_rootfs_locked``); runtime-only guest state
    is scrubbed from the image before export.

    Raises:
        ValueError: if *output_path* is empty or whitespace-only.
    """
    raw_output_path = str(output_path).strip()
    if raw_output_path == "":
        raise ValueError("output_path must not be empty")
    resolved_output_path = Path(output_path).expanduser().resolve()
    with self._lock:
        workspace = self._load_workspace_locked(workspace_id)
        rootfs_path = self._workspace_stopped_disk_rootfs_locked(
            workspace,
            operation_name="workspace_disk_export",
        )
        # Scrub runtime-only guest state before the image leaves the host.
        self._scrub_workspace_runtime_state_locked(workspace, rootfs_path=rootfs_path)
        self._save_workspace_locked(workspace)
        exported = export_workspace_disk_image(rootfs_path, output_path=resolved_output_path)
    return {
        "workspace_id": workspace_id,
        # Round-trip through Path normalizes the reported path string.
        "output_path": str(Path(str(exported["output_path"]))),
        "disk_format": str(exported["disk_format"]),
        "bytes_written": int(exported["bytes_written"]),
    }
|
||||
|
||||
def list_workspace_disk(
    self,
    workspace_id: str,
    *,
    path: str = WORKSPACE_GUEST_PATH,
    recursive: bool = False,
) -> dict[str, Any]:
    """List directory entries of a stopped workspace's disk at *path*.

    The workspace must be stopped and guest-backed; runtime-only guest
    state is scrubbed from the image before inspection.  Returns a dict
    with the normalized path, the recursion flag, and the entry list.
    """
    normalized_path = _normalize_workspace_disk_path(path)
    with self._lock:
        workspace = self._load_workspace_locked(workspace_id)
        rootfs_path = self._workspace_stopped_disk_rootfs_locked(
            workspace,
            operation_name="workspace_disk_list",
        )
        # Scrub runtime-only guest state before inspecting the image.
        self._scrub_workspace_runtime_state_locked(workspace, rootfs_path=rootfs_path)
        self._save_workspace_locked(workspace)
        # The bare name resolves to the module-level helper imported from
        # pyro_mcp.workspace_disk, not this method (globals, not self).
        entries = list_workspace_disk(
            rootfs_path,
            guest_path=normalized_path,
            recursive=recursive,
        )
    return {
        "workspace_id": workspace_id,
        "path": normalized_path,
        "recursive": recursive,
        "entries": entries,
    }
|
||||
|
||||
def read_workspace_disk(
    self,
    workspace_id: str,
    *,
    path: str,
    max_bytes: int = DEFAULT_WORKSPACE_DISK_READ_MAX_BYTES,
) -> dict[str, Any]:
    """Read a file from a stopped workspace's disk at guest *path*.

    The workspace must be stopped and guest-backed; runtime-only guest
    state is scrubbed from the image before reading.  The payload from
    the offline reader is returned with ``workspace_id`` added.
    """
    normalized_path = _normalize_workspace_disk_path(path)
    with self._lock:
        workspace = self._load_workspace_locked(workspace_id)
        rootfs_path = self._workspace_stopped_disk_rootfs_locked(
            workspace,
            operation_name="workspace_disk_read",
        )
        # Scrub runtime-only guest state before inspecting the image.
        self._scrub_workspace_runtime_state_locked(workspace, rootfs_path=rootfs_path)
        self._save_workspace_locked(workspace)
        payload = read_workspace_disk_file(
            rootfs_path,
            guest_path=normalized_path,
            max_bytes=max_bytes,
        )
    payload["workspace_id"] = workspace_id
    return payload
|
||||
|
||||
def delete_workspace(
|
||||
self,
|
||||
workspace_id: str,
|
||||
|
|
@ -4748,6 +4935,67 @@ class VmManager:
|
|||
def _workspace_service_record_path(self, workspace_id: str, service_name: str) -> Path:
|
||||
return self._workspace_services_dir(workspace_id) / f"{service_name}.json"
|
||||
|
||||
def _workspace_rootfs_image_path_locked(
|
||||
self,
|
||||
workspace: WorkspaceRecord,
|
||||
) -> Path:
|
||||
raw_rootfs_image = workspace.metadata.get("rootfs_image")
|
||||
if raw_rootfs_image is None or raw_rootfs_image == "":
|
||||
raise RuntimeError(
|
||||
f"workspace {workspace.workspace_id!r} does not have a persisted rootfs image"
|
||||
)
|
||||
rootfs_path = Path(raw_rootfs_image)
|
||||
if not rootfs_path.exists():
|
||||
raise RuntimeError(
|
||||
f"workspace {workspace.workspace_id!r} rootfs image is unavailable at "
|
||||
f"{rootfs_path}"
|
||||
)
|
||||
return rootfs_path
|
||||
|
||||
def _workspace_stopped_disk_rootfs_locked(
|
||||
self,
|
||||
workspace: WorkspaceRecord,
|
||||
*,
|
||||
operation_name: str,
|
||||
) -> Path:
|
||||
self._ensure_workspace_not_expired_locked(workspace, time.time())
|
||||
self._refresh_workspace_liveness_locked(workspace)
|
||||
if workspace.state != "stopped":
|
||||
raise RuntimeError(
|
||||
f"workspace {workspace.workspace_id!r} must be stopped before {operation_name}"
|
||||
)
|
||||
if workspace.metadata.get("execution_mode") == "host_compat":
|
||||
raise RuntimeError(
|
||||
f"{operation_name} is unavailable for host_compat workspaces"
|
||||
)
|
||||
return self._workspace_rootfs_image_path_locked(workspace)
|
||||
|
||||
def _scrub_workspace_runtime_state_locked(
|
||||
self,
|
||||
workspace: WorkspaceRecord,
|
||||
*,
|
||||
rootfs_path: Path | None = None,
|
||||
) -> None:
|
||||
execution_mode = workspace.metadata.get("execution_mode")
|
||||
if execution_mode == "host_compat":
|
||||
return
|
||||
scrub_workspace_runtime_paths(
|
||||
rootfs_path or self._workspace_rootfs_image_path_locked(workspace)
|
||||
)
|
||||
|
||||
def _flush_workspace_filesystem_locked(
|
||||
self,
|
||||
workspace: WorkspaceRecord,
|
||||
instance: VmInstance,
|
||||
) -> None:
|
||||
if workspace.state != "started":
|
||||
return
|
||||
if self._backend_name == "mock":
|
||||
return
|
||||
if not self._runtime_capabilities.supports_guest_exec:
|
||||
return
|
||||
self._backend.exec(instance, "sync", 10)
|
||||
|
||||
def _count_workspaces_locked(self) -> int:
|
||||
return sum(1 for _ in self._workspaces_dir.glob("*/workspace.json"))
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue