Add workspace service lifecycle with typed readiness

Make persistent workspaces capable of running long-lived background processes instead of forcing everything through one-shot exec calls. Add workspace service start/list/status/logs/stop across the CLI, Python SDK, and MCP server, with multiple named services per workspace, typed readiness probes (file, tcp, http, and command), and aggregate service counts on workspace status. Keep service state and logs outside /workspace so that diff and export semantics stay workspace-scoped, and extend the guest agent and the backends to persist service records and logs across separate calls. Update the 2.7.0 docs, examples, changelog, and roadmap milestone to reflect the shipped surface. Validation: uv lock; UV_CACHE_DIR=.uv-cache make check; UV_CACHE_DIR=.uv-cache make dist-check; and a real guest-backed Firecracker smoke test covering workspace create, two service starts, list/status/logs, confirmation that diff is unaffected, stop, and delete.
2026-03-12 05:36:28 -03:00 · 2026-03-12 05:36:28 -03:00 · f504f0a331
commit f504f0a331
parent 84a7e18d4d
28 changed files with 4098 additions and 124 deletions
--- a/tests/test_vm_manager.py
+++ b/tests/test_vm_manager.py
@ -3,6 +3,7 @@ from __future__ import annotations
 import io
 import json
 import os
+import signal
 import subprocess
 import tarfile
 import time
@ -1144,3 +1145,369 @@ def test_reap_expired_workspaces_removes_invalid_and_expired_records(tmp_path: P

    assert not invalid_dir.exists()
    assert not (tmp_path / "vms" / "workspaces" / workspace_id).exists()
+
+
+def test_workspace_service_lifecycle_and_status_counts(tmp_path: Path) -> None:  # Walks one service through start, list, status, logs, workspace counts, stop, and workspace delete on the mock backend.
+    manager = VmManager(
+        backend_name="mock",
+        base_dir=tmp_path / "vms",
+        network_manager=TapNetworkManager(enabled=False),
+    )
+    workspace_id = str(
+        manager.create_workspace(
+            environment="debian:12-base",
+            allow_host_compat=True,
+        )["workspace_id"]
+    )
+
+    started = manager.start_service(  # the command prints "service ready", touches .ready, then sleeps in a loop
+        workspace_id,
+        "app",
+        command="sh -lc 'printf \"service ready\\n\"; touch .ready; while true; do sleep 60; done'",
+        readiness={"type": "file", "path": ".ready"},
+    )
+    assert started["state"] == "running"
+
+    listed = manager.list_services(workspace_id)
+    assert listed["count"] == 1
+    assert listed["running_count"] == 1
+
+    status = manager.status_service(workspace_id, "app")
+    assert status["state"] == "running"
+    assert status["ready_at"] is not None
+
+    logs = manager.logs_service(workspace_id, "app")
+    assert "service ready" in str(logs["stdout"])  # the line printed by the service command above
+
+    workspace_status = manager.status_workspace(workspace_id)  # workspace status carries aggregate service counts
+    assert workspace_status["service_count"] == 1
+    assert workspace_status["running_service_count"] == 1
+
+    stopped = manager.stop_service(workspace_id, "app")
+    assert stopped["state"] == "stopped"
+    assert stopped["stop_reason"] in {"sigterm", "sigkill"}  # either stop reason is accepted
+
+    deleted = manager.delete_workspace(workspace_id)
+    assert deleted["deleted"] is True
+
+
+def test_workspace_service_start_replaces_non_running_record(tmp_path: Path) -> None:  # Starting a service under a name whose previous record is "failed" must succeed.
+    manager = VmManager(
+        backend_name="mock",
+        base_dir=tmp_path / "vms",
+        network_manager=TapNetworkManager(enabled=False),
+    )
+    workspace_id = str(
+        manager.create_workspace(
+            environment="debian:12-base",
+            allow_host_compat=True,
+        )["workspace_id"]
+    )
+
+    failed = manager.start_service(  # the command exits with status 2 and never creates .ready
+        workspace_id,
+        "app",
+        command="sh -lc 'exit 2'",
+        readiness={"type": "file", "path": ".ready"},
+        ready_timeout_seconds=1,
+        ready_interval_ms=50,
+    )
+    assert failed["state"] == "failed"
+
+    started = manager.start_service(  # same service name "app" again, this time with a command that creates .ready
+        workspace_id,
+        "app",
+        command="sh -lc 'touch .ready; while true; do sleep 60; done'",
+        readiness={"type": "file", "path": ".ready"},
+    )
+    assert started["state"] == "running"
+    manager.delete_workspace(workspace_id)
+
+
+def test_workspace_service_supports_command_readiness_and_helper_probes(  # Covers command readiness through the manager, then file/tcp/http probes through _service_ready_on_host.
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    manager = VmManager(
+        backend_name="mock",
+        base_dir=tmp_path / "vms",
+        network_manager=TapNetworkManager(enabled=False),
+    )
+    workspace_id = str(
+        manager.create_workspace(
+            environment="debian:12-base",
+            allow_host_compat=True,
+        )["workspace_id"]
+    )
+
+    command_started = manager.start_service(  # readiness command tests for the file the service command touches
+        workspace_id,
+        "command-ready",
+        command="sh -lc 'touch command.ready; while true; do sleep 60; done'",
+        readiness={"type": "command", "command": "test -f command.ready"},
+    )
+    assert command_started["state"] == "running"
+
+    listed = manager.list_services(workspace_id)
+    assert listed["count"] == 1
+    assert listed["running_count"] == 1
+
+    status = manager.status_workspace(workspace_id)
+    assert status["service_count"] == 1
+    assert status["running_service_count"] == 1
+
+    assert manager.stop_service(workspace_id, "command-ready")["state"] == "stopped"
+
+    workspace_dir = tmp_path / "vms" / "workspaces" / workspace_id / "workspace"
+    ready_file = workspace_dir / "probe.ready"
+    ready_file.write_text("ok\n", encoding="utf-8")  # the file probe below refers to this file as /workspace/probe.ready
+    assert vm_manager_module._service_ready_on_host(  # noqa: SLF001
+        readiness={"type": "file", "path": "/workspace/probe.ready"},
+        workspace_dir=workspace_dir,
+        cwd=workspace_dir,
+    )
+
+    class StubSocket:  # stand-in socket that asserts the timeout and address it receives
+        def __enter__(self) -> StubSocket:
+            return self
+
+        def __exit__(self, *args: object) -> None:
+            del args
+
+        def settimeout(self, timeout: int) -> None:
+            assert timeout == 1
+
+        def connect(self, address: tuple[str, int]) -> None:
+            assert address == ("127.0.0.1", 8080)  # matches the "127.0.0.1:8080" address in the tcp probe below
+
+    monkeypatch.setattr("pyro_mcp.vm_manager.socket.socket", lambda *args: StubSocket())  # no real connection is attempted
+    assert vm_manager_module._service_ready_on_host(  # noqa: SLF001
+        readiness={"type": "tcp", "address": "127.0.0.1:8080"},
+        workspace_dir=workspace_dir,
+        cwd=workspace_dir,
+    )
+
+    class StubResponse:  # stand-in HTTP response with status 204, usable as a context manager
+        status = 204
+
+        def __enter__(self) -> StubResponse:
+            return self
+
+        def __exit__(self, *args: object) -> None:
+            del args
+
+    def _urlopen(request: object, timeout: int) -> StubResponse:  # fake urlopen: ignores the request, asserts timeout == 2
+        del request
+        assert timeout == 2
+        return StubResponse()
+
+    monkeypatch.setattr("pyro_mcp.vm_manager.urllib.request.urlopen", _urlopen)  # no real HTTP request is made
+    assert vm_manager_module._service_ready_on_host(  # noqa: SLF001
+        readiness={"type": "http", "url": "http://127.0.0.1:8080/"},
+        workspace_dir=workspace_dir,
+        cwd=workspace_dir,
+    )
+
+
+def test_workspace_service_logs_tail_and_delete_cleanup(tmp_path: Path) -> None:  # tail_lines limits returned log output; deleting the workspace removes its services directory.
+    manager = VmManager(
+        backend_name="mock",
+        base_dir=tmp_path / "vms",
+        network_manager=TapNetworkManager(enabled=False),
+    )
+    workspace_id = str(
+        manager.create_workspace(
+            environment="debian:12-base",
+            allow_host_compat=True,
+        )["workspace_id"]
+    )
+
+    manager.start_service(  # the command prints "one" then "two" before touching .ready
+        workspace_id,
+        "logger",
+        command=(
+            "sh -lc 'printf \"one\\n\"; printf \"two\\n\"; "
+            "touch .ready; while true; do sleep 60; done'"
+        ),
+        readiness={"type": "file", "path": ".ready"},
+    )
+
+    logs = manager.logs_service(workspace_id, "logger", tail_lines=1)  # request only the last line
+    assert logs["stdout"] == "two\n"
+    assert logs["truncated"] is True  # "one" was dropped by the tail
+
+    services_dir = tmp_path / "vms" / "workspaces" / workspace_id / "services"  # sibling of the "workspace" directory, not inside it
+    assert services_dir.exists()
+    deleted = manager.delete_workspace(workspace_id)
+    assert deleted["deleted"] is True
+    assert not services_dir.exists()
+
+
+def test_workspace_status_stops_service_counts_when_workspace_is_stopped(tmp_path: Path) -> None:  # After workspace.json is marked stopped, status_workspace must report zero running services and the service record must read stopped.
+    manager = VmManager(
+        backend_name="mock",
+        base_dir=tmp_path / "vms",
+        network_manager=TapNetworkManager(enabled=False),
+    )
+    workspace_id = str(
+        manager.create_workspace(
+            environment="debian:12-base",
+            allow_host_compat=True,
+        )["workspace_id"]
+    )
+    manager.start_service(
+        workspace_id,
+        "app",
+        command="sh -lc 'touch .ready; while true; do sleep 60; done'",
+        readiness={"type": "file", "path": ".ready"},
+    )
+    service_path = tmp_path / "vms" / "workspaces" / workspace_id / "services" / "app.json"
+    live_service_payload = json.loads(service_path.read_text(encoding="utf-8"))
+    live_pid = int(live_service_payload["pid"])  # kept so the finally block can stop the process group
+
+    try:
+        workspace_path = tmp_path / "vms" / "workspaces" / workspace_id / "workspace.json"
+        payload = json.loads(workspace_path.read_text(encoding="utf-8"))
+        payload["state"] = "stopped"  # simulate a stopped workspace by editing the record on disk
+        workspace_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
+
+        status = manager.status_workspace(workspace_id)
+        assert status["state"] == "stopped"
+        assert status["service_count"] == 1  # the service is still counted
+        assert status["running_service_count"] == 0  # but no longer counted as running
+
+        service_payload = json.loads(service_path.read_text(encoding="utf-8"))  # re-read app.json after the status call
+        assert service_payload["state"] == "stopped"
+        assert service_payload["stop_reason"] == "workspace_stopped"
+    finally:  # runs even if an assertion above fails
+        vm_manager_module._stop_process_group(live_pid)  # noqa: SLF001
+
+
+def test_workspace_service_readiness_validation_helpers() -> None:  # Checks name normalization and readiness-spec normalization, including their ValueError messages.
+    assert vm_manager_module._normalize_workspace_service_name("app-1") == "app-1"  # noqa: SLF001
+    with pytest.raises(ValueError, match="service_name must not be empty"):  # whitespace-only name
+        vm_manager_module._normalize_workspace_service_name("   ")  # noqa: SLF001
+    with pytest.raises(ValueError, match="service_name must match"):  # name containing a space
+        vm_manager_module._normalize_workspace_service_name("bad name")  # noqa: SLF001
+
+    assert vm_manager_module._normalize_workspace_service_readiness(  # noqa: SLF001
+        {"type": "file", "path": "subdir/.ready"}
+    ) == {"type": "file", "path": "/workspace/subdir/.ready"}  # relative path comes back prefixed with /workspace/
+    assert vm_manager_module._normalize_workspace_service_readiness(  # noqa: SLF001
+        {"type": "tcp", "address": "127.0.0.1:8080"}
+    ) == {"type": "tcp", "address": "127.0.0.1:8080"}  # returned unchanged
+    assert vm_manager_module._normalize_workspace_service_readiness(  # noqa: SLF001
+        {"type": "http", "url": "http://127.0.0.1:8080/"}
+    ) == {"type": "http", "url": "http://127.0.0.1:8080/"}  # returned unchanged
+    assert vm_manager_module._normalize_workspace_service_readiness(  # noqa: SLF001
+        {"type": "command", "command": "test -f .ready"}
+    ) == {"type": "command", "command": "test -f .ready"}  # returned unchanged
+
+    with pytest.raises(ValueError, match="one of: file, tcp, http, command"):  # unknown readiness type
+        vm_manager_module._normalize_workspace_service_readiness({"type": "bogus"})  # noqa: SLF001
+    with pytest.raises(ValueError, match="required for file readiness"):  # file spec without a path
+        vm_manager_module._normalize_workspace_service_readiness({"type": "file"})  # noqa: SLF001
+    with pytest.raises(ValueError, match="HOST:PORT format"):  # tcp address without a port
+        vm_manager_module._normalize_workspace_service_readiness(  # noqa: SLF001
+            {"type": "tcp", "address": "127.0.0.1"}
+        )
+    with pytest.raises(ValueError, match="required for http readiness"):  # http spec without a url
+        vm_manager_module._normalize_workspace_service_readiness({"type": "http"})  # noqa: SLF001
+    with pytest.raises(ValueError, match="required for command readiness"):  # command spec without a command
+        vm_manager_module._normalize_workspace_service_readiness({"type": "command"})  # noqa: SLF001
+
+
+def test_workspace_service_text_and_exit_code_helpers(tmp_path: Path) -> None:  # Exercises _read_service_exit_code and _tail_text against missing, empty, and populated files.
+    status_path = tmp_path / "service.status"  # not created yet: first check is the missing-file case
+    assert vm_manager_module._read_service_exit_code(status_path) is None  # noqa: SLF001
+    status_path.write_text("", encoding="utf-8")  # empty status file
+    assert vm_manager_module._read_service_exit_code(status_path) is None  # noqa: SLF001
+    status_path.write_text("7\n", encoding="utf-8")  # status file holding an exit code
+    assert vm_manager_module._read_service_exit_code(status_path) == 7  # noqa: SLF001
+
+    log_path = tmp_path / "service.log"  # not created yet: first check is the missing-file case
+    assert vm_manager_module._tail_text(log_path, tail_lines=10) == ("", False)  # noqa: SLF001
+    log_path.write_text("one\ntwo\nthree\n", encoding="utf-8")  # three-line log
+    assert vm_manager_module._tail_text(log_path, tail_lines=None) == (  # noqa: SLF001
+        "one\ntwo\nthree\n",
+        False,
+    )
+    assert vm_manager_module._tail_text(log_path, tail_lines=5) == (  # noqa: SLF001
+        "one\ntwo\nthree\n",
+        False,  # asking for more lines than exist is not reported as truncated
+    )
+    assert vm_manager_module._tail_text(log_path, tail_lines=1) == ("three\n", True)  # noqa: SLF001
+
+
+def test_workspace_service_process_group_helpers(monkeypatch: pytest.MonkeyPatch) -> None:  # _stop_process_group: first with a missing process group, then with SIGTERM followed by SIGKILL.
+    def _missing(_pid: int, _signal: int) -> None:  # killpg stand-in that always raises ProcessLookupError
+        raise ProcessLookupError()
+
+    monkeypatch.setattr("pyro_mcp.vm_manager.os.killpg", _missing)
+    assert vm_manager_module._stop_process_group(123) == (False, False)  # noqa: SLF001
+
+    kill_calls: list[int] = []  # signals passed to the patched killpg, in call order
+    monotonic_values = iter([0.0, 0.0, 5.0, 5.0, 10.0])  # scripted readings for the patched time.monotonic
+    running_states = iter([True, True, False])  # scripted results for the patched _pid_is_running
+
+    def _killpg(_pid: int, signum: int) -> None:
+        kill_calls.append(signum)
+
+    def _monotonic() -> float:
+        return next(monotonic_values)
+
+    def _is_running(_pid: int | None) -> bool:
+        return next(running_states)
+
+    monkeypatch.setattr("pyro_mcp.vm_manager.os.killpg", _killpg)
+    monkeypatch.setattr("pyro_mcp.vm_manager.time.monotonic", _monotonic)
+    monkeypatch.setattr("pyro_mcp.vm_manager.time.sleep", lambda _seconds: None)  # no real sleeping
+    monkeypatch.setattr("pyro_mcp.vm_manager._pid_is_running", _is_running)
+
+    stopped, killed = vm_manager_module._stop_process_group(456, wait_seconds=5)  # noqa: SLF001
+    assert (stopped, killed) == (True, True)
+    assert kill_calls == [signal.SIGTERM, signal.SIGKILL]  # SIGTERM is sent first, then SIGKILL
+
+
+def test_workspace_service_probe_and_refresh_helpers(  # Covers _run_service_probe_command, _refresh_local_service_record, and _stop_local_service.
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    assert vm_manager_module._run_service_probe_command(tmp_path, "exit 3") == 3  # noqa: SLF001
+
+    services_dir = tmp_path / "services"
+    services_dir.mkdir()
+    status_path = services_dir / "app.status"
+    status_path.write_text("9\n", encoding="utf-8")  # the refresh below is expected to report exit_code 9
+    running = vm_manager_module.WorkspaceServiceRecord(  # noqa: SLF001
+        workspace_id="workspace-1",
+        service_name="app",
+        command="sleep 60",
+        cwd="/workspace",
+        state="running",
+        started_at=time.time(),
+        readiness=None,
+        ready_at=None,
+        ended_at=None,
+        exit_code=None,
+        pid=1234,  # arbitrary pid; _pid_is_running is patched below
+        execution_mode="host_compat",
+        stop_reason=None,
+    )
+
+    monkeypatch.setattr("pyro_mcp.vm_manager._pid_is_running", lambda _pid: False)  # pretend the recorded pid is gone
+    refreshed = vm_manager_module._refresh_local_service_record(  # noqa: SLF001
+        running,
+        services_dir=services_dir,
+    )
+    assert refreshed.state == "exited"
+    assert refreshed.exit_code == 9
+
+    monkeypatch.setattr(  # stub _stop_process_group so no signal is sent; it returns (True, False)
+        "pyro_mcp.vm_manager._stop_process_group",
+        lambda _pid: (True, False),
+    )
+    stopped = vm_manager_module._stop_local_service(  # noqa: SLF001
+        refreshed,
+        services_dir=services_dir,
+    )
+    assert stopped.state == "stopped"
+    assert stopped.stop_reason == "sigterm"