Add workspace service lifecycle with typed readiness

Make persistent workspaces capable of running long-lived background processes instead of forcing everything through one-shot exec calls.

Add workspace service start/list/status/logs/stop across the CLI, Python SDK, and MCP server, with multiple named services per workspace, typed readiness probes (file, tcp, http, and command), and aggregate service counts on workspace status. Keep service state and logs outside /workspace so diff and export semantics stay workspace-scoped, and extend the guest agent plus backends to persist service records and logs across separate calls.

Update the 2.7.0 docs, examples, changelog, and roadmap milestone to reflect the shipped surface.

Validation: uv lock; UV_CACHE_DIR=.uv-cache make check; UV_CACHE_DIR=.uv-cache make dist-check; real guest-backed Firecracker smoke for workspace create, two service starts, list/status/logs, diff unaffected, stop, and delete.
This commit is contained in:
Thales Maciel 2026-03-12 05:36:28 -03:00
parent 84a7e18d4d
commit f504f0a331
28 changed files with 4098 additions and 124 deletions

View file

@ -3,6 +3,7 @@ from __future__ import annotations
import io
import json
import os
import signal
import subprocess
import tarfile
import time
@ -1144,3 +1145,369 @@ def test_reap_expired_workspaces_removes_invalid_and_expired_records(tmp_path: P
assert not invalid_dir.exists()
assert not (tmp_path / "vms" / "workspaces" / workspace_id).exists()
def test_workspace_service_lifecycle_and_status_counts(tmp_path: Path) -> None:
    """Walk one service through start, list, status, logs, and stop.

    Also checks that the workspace status payload reports aggregate service
    counts while the service is running.

    The service command loops forever, so the workspace is deleted in a
    ``finally`` clause: a failing assertion must not leave the test run with a
    dangling workspace.
    """
    manager = VmManager(
        backend_name="mock",
        base_dir=tmp_path / "vms",
        network_manager=TapNetworkManager(enabled=False),
    )
    workspace_id = str(
        manager.create_workspace(
            environment="debian:12-base",
            allow_host_compat=True,
        )["workspace_id"]
    )
    try:
        started = manager.start_service(
            workspace_id,
            "app",
            command=(
                "sh -lc 'printf \"service ready\\n\"; touch .ready; "
                "while true; do sleep 60; done'"
            ),
            readiness={"type": "file", "path": ".ready"},
        )
        assert started["state"] == "running"

        listed = manager.list_services(workspace_id)
        assert listed["count"] == 1
        assert listed["running_count"] == 1

        status = manager.status_service(workspace_id, "app")
        assert status["state"] == "running"
        assert status["ready_at"] is not None

        logs = manager.logs_service(workspace_id, "app")
        assert "service ready" in str(logs["stdout"])

        workspace_status = manager.status_workspace(workspace_id)
        assert workspace_status["service_count"] == 1
        assert workspace_status["running_service_count"] == 1

        stopped = manager.stop_service(workspace_id, "app")
        assert stopped["state"] == "stopped"
        assert stopped["stop_reason"] in {"sigterm", "sigkill"}
    finally:
        # NOTE(review): presumably delete_workspace also terminates any service
        # that is still running -- confirm against VmManager.
        deleted = manager.delete_workspace(workspace_id)
    assert deleted["deleted"] is True
def test_workspace_service_start_replaces_non_running_record(tmp_path: Path) -> None:
    """Starting a service whose previous record is ``failed`` must succeed.

    The first start uses a command that exits immediately with status 2 and a
    short readiness window, and is expected to report ``failed``. A second
    start under the same name is then expected to report ``running``.

    The second command loops forever, so the workspace is deleted in a
    ``finally`` clause to clean up even when an assertion fails.
    """
    manager = VmManager(
        backend_name="mock",
        base_dir=tmp_path / "vms",
        network_manager=TapNetworkManager(enabled=False),
    )
    workspace_id = str(
        manager.create_workspace(
            environment="debian:12-base",
            allow_host_compat=True,
        )["workspace_id"]
    )
    try:
        failed = manager.start_service(
            workspace_id,
            "app",
            command="sh -lc 'exit 2'",
            readiness={"type": "file", "path": ".ready"},
            ready_timeout_seconds=1,
            ready_interval_ms=50,
        )
        assert failed["state"] == "failed"

        started = manager.start_service(
            workspace_id,
            "app",
            command="sh -lc 'touch .ready; while true; do sleep 60; done'",
            readiness={"type": "file", "path": ".ready"},
        )
        assert started["state"] == "running"
    finally:
        # NOTE(review): presumably delete_workspace also terminates any service
        # that is still running -- confirm against VmManager.
        manager.delete_workspace(workspace_id)
def test_workspace_service_supports_command_readiness_and_helper_probes(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Cover command readiness end to end, then each host-side probe type.

    The first half starts a service gated on a ``command`` readiness probe and
    checks the service and workspace counters before stopping it. The second
    half calls ``_service_ready_on_host`` directly for ``file``, ``tcp`` and
    ``http`` readiness, with the socket and ``urlopen`` entry points stubbed.
    """
    vm_root = tmp_path / "vms"
    manager = VmManager(
        backend_name="mock",
        base_dir=vm_root,
        network_manager=TapNetworkManager(enabled=False),
    )
    created = manager.create_workspace(
        environment="debian:12-base",
        allow_host_compat=True,
    )
    workspace_id = str(created["workspace_id"])

    service = manager.start_service(
        workspace_id,
        "command-ready",
        command="sh -lc 'touch command.ready; while true; do sleep 60; done'",
        readiness={"type": "command", "command": "test -f command.ready"},
    )
    assert service["state"] == "running"

    service_listing = manager.list_services(workspace_id)
    assert service_listing["count"] == 1
    assert service_listing["running_count"] == 1

    workspace_status = manager.status_workspace(workspace_id)
    assert workspace_status["service_count"] == 1
    assert workspace_status["running_service_count"] == 1

    stop_result = manager.stop_service(workspace_id, "command-ready")
    assert stop_result["state"] == "stopped"

    # --- file probe: an absolute /workspace path checked against the host dir.
    workspace_dir = vm_root / "workspaces" / workspace_id / "workspace"
    (workspace_dir / "probe.ready").write_text("ok\n", encoding="utf-8")
    probe_ready = vm_manager_module._service_ready_on_host  # noqa: SLF001
    assert probe_ready(
        readiness={"type": "file", "path": "/workspace/probe.ready"},
        workspace_dir=workspace_dir,
        cwd=workspace_dir,
    )

    # --- tcp probe: the stub asserts the timeout and address it is given.
    class _FakeSocket:
        def __enter__(self) -> _FakeSocket:
            return self

        def __exit__(self, *args: object) -> None:
            del args

        def settimeout(self, timeout: int) -> None:
            assert timeout == 1

        def connect(self, address: tuple[str, int]) -> None:
            assert address == ("127.0.0.1", 8080)

    monkeypatch.setattr("pyro_mcp.vm_manager.socket.socket", lambda *args: _FakeSocket())
    assert probe_ready(
        readiness={"type": "tcp", "address": "127.0.0.1:8080"},
        workspace_dir=workspace_dir,
        cwd=workspace_dir,
    )

    # --- http probe: the stub asserts the timeout and answers with a 204.
    class _FakeResponse:
        status = 204

        def __enter__(self) -> _FakeResponse:
            return self

        def __exit__(self, *args: object) -> None:
            del args

    def _fake_urlopen(request: object, timeout: int) -> _FakeResponse:
        del request
        assert timeout == 2
        return _FakeResponse()

    monkeypatch.setattr("pyro_mcp.vm_manager.urllib.request.urlopen", _fake_urlopen)
    assert probe_ready(
        readiness={"type": "http", "url": "http://127.0.0.1:8080/"},
        workspace_dir=workspace_dir,
        cwd=workspace_dir,
    )
def test_workspace_service_logs_tail_and_delete_cleanup(tmp_path: Path) -> None:
    """Tail a service log to one line, then confirm delete removes ``services/``.

    The service prints two lines; requesting ``tail_lines=1`` must return only
    the last one and flag the result as truncated. Deleting the workspace must
    remove its ``services`` directory.
    """
    vm_root = tmp_path / "vms"
    manager = VmManager(
        backend_name="mock",
        base_dir=vm_root,
        network_manager=TapNetworkManager(enabled=False),
    )
    created = manager.create_workspace(
        environment="debian:12-base",
        allow_host_compat=True,
    )
    workspace_id = str(created["workspace_id"])
    services_dir = vm_root / "workspaces" / workspace_id / "services"

    manager.start_service(
        workspace_id,
        "logger",
        command=(
            "sh -lc 'printf \"one\\n\"; printf \"two\\n\"; "
            "touch .ready; while true; do sleep 60; done'"
        ),
        readiness={"type": "file", "path": ".ready"},
    )

    tail = manager.logs_service(workspace_id, "logger", tail_lines=1)
    assert tail["stdout"] == "two\n"
    assert tail["truncated"] is True
    assert services_dir.exists()

    result = manager.delete_workspace(workspace_id)
    assert result["deleted"] is True
    assert not services_dir.exists()
def test_workspace_status_stops_service_counts_when_workspace_is_stopped(tmp_path: Path) -> None:
    """A stopped workspace must report zero running services.

    After a service is started, the persisted workspace record is rewritten
    with ``state = "stopped"``. ``status_workspace`` is then expected to keep
    the service counted, report no running services, and persist the service
    record as stopped with reason ``workspace_stopped``. The pid captured from
    the live record is always passed to ``_stop_process_group`` on the way out.
    """
    vm_root = tmp_path / "vms"
    manager = VmManager(
        backend_name="mock",
        base_dir=vm_root,
        network_manager=TapNetworkManager(enabled=False),
    )
    created = manager.create_workspace(
        environment="debian:12-base",
        allow_host_compat=True,
    )
    workspace_id = str(created["workspace_id"])
    manager.start_service(
        workspace_id,
        "app",
        command="sh -lc 'touch .ready; while true; do sleep 60; done'",
        readiness={"type": "file", "path": ".ready"},
    )

    workspace_root = vm_root / "workspaces" / workspace_id
    service_record_path = workspace_root / "services" / "app.json"
    workspace_record_path = workspace_root / "workspace.json"

    # Capture the pid before touching any state so cleanup can always reach it.
    live_pid = int(json.loads(service_record_path.read_text(encoding="utf-8"))["pid"])
    try:
        workspace_record = json.loads(workspace_record_path.read_text(encoding="utf-8"))
        workspace_record["state"] = "stopped"
        workspace_record_path.write_text(
            json.dumps(workspace_record, indent=2, sort_keys=True),
            encoding="utf-8",
        )

        reported = manager.status_workspace(workspace_id)
        assert reported["state"] == "stopped"
        assert reported["service_count"] == 1
        assert reported["running_service_count"] == 0

        service_record = json.loads(service_record_path.read_text(encoding="utf-8"))
        assert service_record["state"] == "stopped"
        assert service_record["stop_reason"] == "workspace_stopped"
    finally:
        vm_manager_module._stop_process_group(live_pid)  # noqa: SLF001
def test_workspace_service_readiness_validation_helpers() -> None:
    """Table-driven checks for the service-name and readiness normalizers.

    Covers one accepted name and two rejected names, one readiness spec that is
    rewritten (a relative file path gains the ``/workspace/`` prefix), three
    specs that come back unchanged, and five specs that raise ``ValueError``.
    """
    normalize_name = vm_manager_module._normalize_workspace_service_name  # noqa: SLF001
    normalize_readiness = vm_manager_module._normalize_workspace_service_readiness  # noqa: SLF001

    assert normalize_name("app-1") == "app-1"
    rejected_names = (
        (" ", "service_name must not be empty"),
        ("bad name", "service_name must match"),
    )
    for bad_name, message in rejected_names:
        with pytest.raises(ValueError, match=message):
            normalize_name(bad_name)

    assert normalize_readiness({"type": "file", "path": "subdir/.ready"}) == {
        "type": "file",
        "path": "/workspace/subdir/.ready",
    }

    passthrough_specs = (
        {"type": "tcp", "address": "127.0.0.1:8080"},
        {"type": "http", "url": "http://127.0.0.1:8080/"},
        {"type": "command", "command": "test -f .ready"},
    )
    for expected in passthrough_specs:
        # Hand over a copy so the comparison is against an untouched dict.
        assert normalize_readiness(dict(expected)) == expected

    rejected_specs = (
        ({"type": "bogus"}, "one of: file, tcp, http, command"),
        ({"type": "file"}, "required for file readiness"),
        ({"type": "tcp", "address": "127.0.0.1"}, "HOST:PORT format"),
        ({"type": "http"}, "required for http readiness"),
        ({"type": "command"}, "required for command readiness"),
    )
    for spec, message in rejected_specs:
        with pytest.raises(ValueError, match=message):
            normalize_readiness(spec)
def test_workspace_service_text_and_exit_code_helpers(tmp_path: Path) -> None:
    """Check ``_read_service_exit_code`` and ``_tail_text`` on small files.

    The exit-code reader must return ``None`` for a missing or empty status
    file and the parsed integer otherwise. The tail helper must return an
    empty, untruncated result for a missing log, the full text when no limit
    or a generous limit is given, and only the last line (flagged truncated)
    when limited to one line.
    """
    read_exit_code = vm_manager_module._read_service_exit_code  # noqa: SLF001
    tail_text = vm_manager_module._tail_text  # noqa: SLF001

    exit_code_file = tmp_path / "service.status"
    assert read_exit_code(exit_code_file) is None
    exit_code_file.write_text("", encoding="utf-8")
    assert read_exit_code(exit_code_file) is None
    exit_code_file.write_text("7\n", encoding="utf-8")
    assert read_exit_code(exit_code_file) == 7

    log_file = tmp_path / "service.log"
    assert tail_text(log_file, tail_lines=10) == ("", False)

    full_text = "one\ntwo\nthree\n"
    log_file.write_text(full_text, encoding="utf-8")
    for limit in (None, 5):
        assert tail_text(log_file, tail_lines=limit) == (full_text, False)
    assert tail_text(log_file, tail_lines=1) == ("three\n", True)
def test_workspace_service_process_group_helpers(monkeypatch: pytest.MonkeyPatch) -> None:
    """Check ``_stop_process_group`` with ``os.killpg`` and the clock stubbed out.

    Two scenarios are exercised:

    * ``killpg`` raises ``ProcessLookupError``: the helper is expected to
      return ``(False, False)``.
    * ``killpg`` succeeds and the fake liveness check keeps answering ``True``
      for a while: the helper is expected to return ``(True, True)`` after
      sending ``SIGTERM`` followed by ``SIGKILL``.
    """
    # Scenario 1: the process group does not exist.
    def _missing(_pid: int, _signal: int) -> None:
        raise ProcessLookupError()

    monkeypatch.setattr("pyro_mcp.vm_manager.os.killpg", _missing)
    assert vm_manager_module._stop_process_group(123) == (False, False)  # noqa: SLF001

    # Scenario 2: record every signal sent and script the clock and liveness
    # answers.
    # NOTE(review): the length and order of these sequences presumably mirror
    # the exact calls made inside _stop_process_group -- confirm against the
    # helper before editing them.
    kill_calls: list[int] = []
    monotonic_values = iter([0.0, 0.0, 5.0, 5.0, 10.0])
    running_states = iter([True, True, False])

    def _killpg(_pid: int, signum: int) -> None:
        kill_calls.append(signum)

    def _monotonic() -> float:
        return next(monotonic_values)

    def _is_running(_pid: int | None) -> bool:
        return next(running_states)

    monkeypatch.setattr("pyro_mcp.vm_manager.os.killpg", _killpg)
    monkeypatch.setattr("pyro_mcp.vm_manager.time.monotonic", _monotonic)
    # Sleeping is replaced with a no-op so the test does not actually wait.
    monkeypatch.setattr("pyro_mcp.vm_manager.time.sleep", lambda _seconds: None)
    monkeypatch.setattr("pyro_mcp.vm_manager._pid_is_running", _is_running)
    stopped, killed = vm_manager_module._stop_process_group(456, wait_seconds=5)  # noqa: SLF001
    assert (stopped, killed) == (True, True)
    assert kill_calls == [signal.SIGTERM, signal.SIGKILL]
def test_workspace_service_probe_and_refresh_helpers(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Check the probe-command runner plus local record refresh and stop.

    * ``_run_service_probe_command`` must return the command's exit status.
    * ``_refresh_local_service_record`` must turn a ``running`` record into
      ``exited`` with the exit code from ``app.status`` once the pid is
      reported as not running.
    * ``_stop_local_service`` must mark the record ``stopped`` with reason
      ``sigterm`` when the stubbed ``_stop_process_group`` returns
      ``(True, False)``.
    """
    probe_exit_code = vm_manager_module._run_service_probe_command(  # noqa: SLF001
        tmp_path,
        "exit 3",
    )
    assert probe_exit_code == 3

    services_dir = tmp_path / "services"
    services_dir.mkdir()
    (services_dir / "app.status").write_text("9\n", encoding="utf-8")

    live_record = vm_manager_module.WorkspaceServiceRecord(  # noqa: SLF001
        workspace_id="workspace-1",
        service_name="app",
        command="sleep 60",
        cwd="/workspace",
        state="running",
        started_at=time.time(),
        readiness=None,
        ready_at=None,
        ended_at=None,
        exit_code=None,
        pid=1234,
        execution_mode="host_compat",
        stop_reason=None,
    )

    # Report the pid as gone so the refresh reads the status file.
    monkeypatch.setattr("pyro_mcp.vm_manager._pid_is_running", lambda _pid: False)
    exited_record = vm_manager_module._refresh_local_service_record(  # noqa: SLF001
        live_record,
        services_dir=services_dir,
    )
    assert exited_record.state == "exited"
    assert exited_record.exit_code == 9

    # Stub the process-group stop so no real signal is sent.
    monkeypatch.setattr(
        "pyro_mcp.vm_manager._stop_process_group",
        lambda _pid: (True, False),
    )
    stopped_record = vm_manager_module._stop_local_service(  # noqa: SLF001
        exited_record,
        services_dir=services_dir,
    )
    assert stopped_record.state == "stopped"
    assert stopped_record.stop_reason == "sigterm"