Start the first workspace milestone toward the task-oriented product without changing the existing one-shot vm_run/pyro run contract. Add a disk-backed task registry in the manager, auto-started task workspaces rooted at /workspace, repeated non-cleaning exec, and persisted command journals exposed through task create/exec/status/logs/delete across the CLI, Python SDK, and MCP server. Update the public contract, docs, examples, and version/catalog metadata for 2.1.0, and cover the new surface with manager, CLI, SDK, and MCP tests. Validation: UV_CACHE_DIR=.uv-cache make check and UV_CACHE_DIR=.uv-cache make dist-check.
1232 lines
48 KiB
Python
1232 lines
48 KiB
Python
"""Lifecycle manager for ephemeral VM environments and persistent tasks."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import shlex
|
|
import shutil
|
|
import signal
|
|
import subprocess
|
|
import threading
|
|
import time
|
|
import uuid
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import Any, Literal, cast
|
|
|
|
from pyro_mcp.runtime import (
|
|
RuntimeCapabilities,
|
|
RuntimePaths,
|
|
resolve_runtime_paths,
|
|
runtime_capabilities,
|
|
)
|
|
from pyro_mcp.vm_environments import EnvironmentStore, default_cache_dir, get_environment
|
|
from pyro_mcp.vm_firecracker import build_launch_plan
|
|
from pyro_mcp.vm_guest import VsockExecClient
|
|
from pyro_mcp.vm_network import NetworkConfig, TapNetworkManager
|
|
|
|
# Lifecycle states shared by ephemeral VMs and persistent tasks.
VmState = Literal["created", "started", "stopped"]

# Default sizing/lifetime knobs; mirrored as class attributes on VmManager.
DEFAULT_VCPU_COUNT = 1
DEFAULT_MEM_MIB = 1024
DEFAULT_TIMEOUT_SECONDS = 30
DEFAULT_TTL_SECONDS = 600
DEFAULT_ALLOW_HOST_COMPAT = False

# On-disk task layout version, stamped into task payloads (see TaskRecord.to_payload).
TASK_LAYOUT_VERSION = 1
# Presumably subdirectories under <tasks_dir>/<task_id>/ — TODO confirm against
# the _task_*_dir helpers (not visible in this chunk).
TASK_WORKSPACE_DIRNAME = "workspace"
TASK_COMMANDS_DIRNAME = "commands"
TASK_RUNTIME_DIRNAME = "runtime"
# Path at which the task workspace is exposed inside the guest.
TASK_WORKSPACE_GUEST_PATH = "/workspace"
|
|
|
|
|
|
@dataclass
class VmInstance:
    """In-memory VM lifecycle record.

    Tracked only for the life of the manager process; persistent tasks are
    snapshotted into TaskRecord instead.
    """

    vm_id: str
    environment: str
    vcpu_count: int
    mem_mib: int
    ttl_seconds: int
    created_at: float  # epoch seconds
    expires_at: float  # created_at + ttl_seconds
    workdir: Path  # per-VM scratch directory on the host
    state: VmState = "created"
    network_requested: bool = False
    allow_host_compat: bool = DEFAULT_ALLOW_HOST_COMPAT
    firecracker_pid: int | None = None  # set once the microVM process is up
    last_error: str | None = None
    metadata: dict[str, str] = field(default_factory=dict)
    network: NetworkConfig | None = None  # allocated tap config, if any
|
|
|
|
|
|
@dataclass
class TaskRecord:
    """Persistent task metadata stored on disk.

    JSON-serializable mirror of VmInstance so tasks survive manager restarts;
    the converters below bridge between the two representations.
    """

    task_id: str
    environment: str
    vcpu_count: int
    mem_mib: int
    ttl_seconds: int
    created_at: float
    expires_at: float
    state: VmState
    network_requested: bool
    allow_host_compat: bool
    firecracker_pid: int | None = None
    last_error: str | None = None
    metadata: dict[str, str] = field(default_factory=dict)
    network: NetworkConfig | None = None
    # Command-journal summary: total commands run and the most recent entry.
    command_count: int = 0
    last_command: dict[str, Any] | None = None

    @classmethod
    def from_instance(
        cls,
        instance: VmInstance,
        *,
        command_count: int = 0,
        last_command: dict[str, Any] | None = None,
    ) -> TaskRecord:
        """Snapshot a live VmInstance into a persistable record."""
        return cls(
            task_id=instance.vm_id,
            environment=instance.environment,
            vcpu_count=instance.vcpu_count,
            mem_mib=instance.mem_mib,
            ttl_seconds=instance.ttl_seconds,
            created_at=instance.created_at,
            expires_at=instance.expires_at,
            state=instance.state,
            network_requested=instance.network_requested,
            allow_host_compat=instance.allow_host_compat,
            firecracker_pid=instance.firecracker_pid,
            last_error=instance.last_error,
            metadata=dict(instance.metadata),  # defensive copy
            network=instance.network,
            command_count=command_count,
            last_command=last_command,
        )

    def to_instance(self, *, workdir: Path) -> VmInstance:
        """Rehydrate an in-memory VmInstance rooted at *workdir*."""
        return VmInstance(
            vm_id=self.task_id,
            environment=self.environment,
            vcpu_count=self.vcpu_count,
            mem_mib=self.mem_mib,
            ttl_seconds=self.ttl_seconds,
            created_at=self.created_at,
            expires_at=self.expires_at,
            workdir=workdir,
            state=self.state,
            network_requested=self.network_requested,
            allow_host_compat=self.allow_host_compat,
            firecracker_pid=self.firecracker_pid,
            last_error=self.last_error,
            metadata=dict(self.metadata),  # defensive copy
            network=self.network,
        )

    def to_payload(self) -> dict[str, Any]:
        """Serialize to a JSON-safe dict tagged with the task layout version."""
        return {
            "layout_version": TASK_LAYOUT_VERSION,
            "task_id": self.task_id,
            "environment": self.environment,
            "vcpu_count": self.vcpu_count,
            "mem_mib": self.mem_mib,
            "ttl_seconds": self.ttl_seconds,
            "created_at": self.created_at,
            "expires_at": self.expires_at,
            "state": self.state,
            "network_requested": self.network_requested,
            "allow_host_compat": self.allow_host_compat,
            "firecracker_pid": self.firecracker_pid,
            "last_error": self.last_error,
            "metadata": self.metadata,
            "network": _serialize_network(self.network),
            "command_count": self.command_count,
            "last_command": self.last_command,
        }

    @classmethod
    def from_payload(cls, payload: dict[str, Any]) -> TaskRecord:
        """Parse a payload produced by to_payload, tolerating missing optionals.

        Required keys raise KeyError; optional ones fall back to safe defaults
        (unknown state is treated as "stopped").
        """
        return cls(
            task_id=str(payload["task_id"]),
            environment=str(payload["environment"]),
            vcpu_count=int(payload["vcpu_count"]),
            mem_mib=int(payload["mem_mib"]),
            ttl_seconds=int(payload["ttl_seconds"]),
            created_at=float(payload["created_at"]),
            expires_at=float(payload["expires_at"]),
            state=cast(VmState, str(payload.get("state", "stopped"))),
            network_requested=bool(payload.get("network_requested", False)),
            allow_host_compat=bool(payload.get("allow_host_compat", DEFAULT_ALLOW_HOST_COMPAT)),
            firecracker_pid=_optional_int(payload.get("firecracker_pid")),
            last_error=_optional_str(payload.get("last_error")),
            metadata=_string_dict(payload.get("metadata")),
            network=_deserialize_network(payload.get("network")),
            command_count=int(payload.get("command_count", 0)),
            last_command=_optional_dict(payload.get("last_command")),
        )
|
|
|
|
|
|
@dataclass(frozen=True)
class VmExecResult:
    """Command execution output."""

    stdout: str
    stderr: str
    exit_code: int  # 124 conventionally signals a timeout
    duration_ms: int  # wall-clock duration measured by the caller
|
|
|
|
|
|
def _optional_int(value: object) -> int | None:
|
|
if value is None:
|
|
return None
|
|
if isinstance(value, bool):
|
|
return int(value)
|
|
if isinstance(value, int):
|
|
return value
|
|
if isinstance(value, float):
|
|
return int(value)
|
|
if isinstance(value, str):
|
|
return int(value)
|
|
raise TypeError("expected integer-compatible payload")
|
|
|
|
|
|
def _optional_str(value: object) -> str | None:
|
|
if value is None:
|
|
return None
|
|
return str(value)
|
|
|
|
|
|
def _optional_dict(value: object) -> dict[str, Any] | None:
|
|
if value is None:
|
|
return None
|
|
if not isinstance(value, dict):
|
|
raise TypeError("expected dictionary payload")
|
|
return dict(value)
|
|
|
|
|
|
def _string_dict(value: object) -> dict[str, str]:
|
|
if not isinstance(value, dict):
|
|
return {}
|
|
return {str(key): str(item) for key, item in value.items()}
|
|
|
|
|
|
def _serialize_network(network: NetworkConfig | None) -> dict[str, Any] | None:
|
|
if network is None:
|
|
return None
|
|
return {
|
|
"vm_id": network.vm_id,
|
|
"tap_name": network.tap_name,
|
|
"guest_ip": network.guest_ip,
|
|
"gateway_ip": network.gateway_ip,
|
|
"subnet_cidr": network.subnet_cidr,
|
|
"mac_address": network.mac_address,
|
|
"dns_servers": list(network.dns_servers),
|
|
}
|
|
|
|
|
|
def _deserialize_network(payload: object) -> NetworkConfig | None:
    """Rebuild a NetworkConfig from its serialized form; ``None`` stays ``None``."""
    if payload is None:
        return None
    if not isinstance(payload, dict):
        raise TypeError("expected dictionary payload")
    raw_dns = payload.get("dns_servers", [])
    # Only a list is honored; any other shape degrades to an empty tuple.
    dns: tuple[str, ...] = ()
    if isinstance(raw_dns, list):
        dns = tuple(str(entry) for entry in raw_dns)
    required = ("vm_id", "tap_name", "guest_ip", "gateway_ip", "subnet_cidr", "mac_address")
    kwargs = {name: str(payload[name]) for name in required}
    return NetworkConfig(dns_servers=dns, **kwargs)
|
|
|
|
|
|
def _run_host_command(workdir: Path, command: str, timeout_seconds: int) -> VmExecResult:
    """Run *command* via ``bash -lc`` on the host, scoped to *workdir*.

    The environment is reduced to PATH plus HOME=workdir so host state does
    not leak into compat executions.  A timeout is reported as exit code 124
    (the conventional ``timeout(1)`` code) instead of raising; any partial
    stdout captured before the deadline is preserved rather than discarded.
    """
    started = time.monotonic()
    env = {"PATH": os.environ.get("PATH", ""), "HOME": str(workdir)}
    try:
        proc = subprocess.run(  # noqa: S603
            ["bash", "-lc", command],  # noqa: S607
            cwd=workdir,
            env=env,
            text=True,
            capture_output=True,
            timeout=timeout_seconds,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        # TimeoutExpired.stdout may be None (or bytes on some platforms);
        # only pass through decoded partial output.
        partial_out = exc.stdout if isinstance(exc.stdout, str) else ""
        return VmExecResult(
            stdout=partial_out,
            stderr=f"command timed out after {timeout_seconds}s",
            exit_code=124,
            duration_ms=int((time.monotonic() - started) * 1000),
        )
    return VmExecResult(
        stdout=proc.stdout,
        stderr=proc.stderr,
        exit_code=proc.returncode,
        duration_ms=int((time.monotonic() - started) * 1000),
    )
|
|
|
|
|
|
def _copy_rootfs(source: Path, dest: Path) -> str:
|
|
dest.parent.mkdir(parents=True, exist_ok=True)
|
|
try:
|
|
proc = subprocess.run( # noqa: S603
|
|
["cp", "--reflink=auto", str(source), str(dest)],
|
|
text=True,
|
|
capture_output=True,
|
|
check=False,
|
|
)
|
|
if proc.returncode == 0:
|
|
return "reflink_or_copy"
|
|
except OSError:
|
|
pass
|
|
shutil.copy2(source, dest)
|
|
return "copy2"
|
|
|
|
|
|
def _wrap_guest_command(command: str, *, cwd: str | None = None) -> str:
|
|
if cwd is None:
|
|
return command
|
|
quoted_cwd = shlex.quote(cwd)
|
|
return f"mkdir -p {quoted_cwd} && cd {quoted_cwd} && {command}"
|
|
|
|
|
|
def _pid_is_running(pid: int | None) -> bool:
|
|
if pid is None:
|
|
return False
|
|
try:
|
|
os.kill(pid, 0)
|
|
except ProcessLookupError:
|
|
return False
|
|
except PermissionError:
|
|
return True
|
|
return True
|
|
|
|
|
|
class VmBackend:
    """Backend interface for lifecycle operations."""

    def create(self, instance: VmInstance) -> None:  # pragma: no cover
        """Provision on-disk state for *instance* (e.g. its workdir)."""
        raise NotImplementedError

    def start(self, instance: VmInstance) -> None:  # pragma: no cover
        """Boot the VM (or its development stand-in)."""
        raise NotImplementedError

    def exec(  # pragma: no cover
        self,
        instance: VmInstance,
        command: str,
        timeout_seconds: int,
        *,
        workdir: Path | None = None,
    ) -> VmExecResult:
        """Run *command* inside the VM; *workdir* overrides the exec directory."""
        raise NotImplementedError

    def stop(self, instance: VmInstance) -> None:  # pragma: no cover
        """Stop the VM without removing its on-disk state."""
        raise NotImplementedError

    def delete(self, instance: VmInstance) -> None:  # pragma: no cover
        """Tear down the VM and remove its on-disk state."""
        raise NotImplementedError
|
|
|
|
|
|
class MockBackend(VmBackend):
    """Host-process backend used for development and testability."""

    def create(self, instance: VmInstance) -> None:
        # exist_ok=False: creating the same VM id twice is a programming error.
        instance.workdir.mkdir(parents=True, exist_ok=False)

    def start(self, instance: VmInstance) -> None:
        # Lifecycle transitions are recorded as marker files in the workdir.
        (instance.workdir / ".started").write_text("started\n", encoding="utf-8")

    def exec(
        self,
        instance: VmInstance,
        command: str,
        timeout_seconds: int,
        *,
        workdir: Path | None = None,
    ) -> VmExecResult:
        return _run_host_command(workdir or instance.workdir, command, timeout_seconds)

    def stop(self, instance: VmInstance) -> None:
        (instance.workdir / ".stopped").write_text("stopped\n", encoding="utf-8")

    def delete(self, instance: VmInstance) -> None:
        shutil.rmtree(instance.workdir, ignore_errors=True)
|
|
|
|
|
|
class FirecrackerBackend(VmBackend):  # pragma: no cover
    """Host-gated backend that validates Firecracker prerequisites.

    Requires the bundled firecracker/jailer binaries and /dev/kvm.  When the
    runtime lacks VM-boot support, ``start`` degrades to a host-compat shim
    and ``exec`` runs commands on the host instead of over vsock.
    """

    def __init__(
        self,
        environment_store: EnvironmentStore,
        firecracker_bin: Path,
        jailer_bin: Path,
        runtime_capabilities: RuntimeCapabilities,
        network_manager: TapNetworkManager | None = None,
        guest_exec_client: VsockExecClient | None = None,
    ) -> None:
        """Validate binaries and KVM availability up front; fail fast otherwise."""
        self._environment_store = environment_store
        self._firecracker_bin = firecracker_bin
        self._jailer_bin = jailer_bin
        self._runtime_capabilities = runtime_capabilities
        self._network_manager = network_manager or TapNetworkManager()
        self._guest_exec_client = guest_exec_client or VsockExecClient()
        # Popen handles for microVMs started by this process, keyed by vm_id.
        self._processes: dict[str, subprocess.Popen[str]] = {}
        if not self._firecracker_bin.exists():
            raise RuntimeError(f"bundled firecracker binary not found at {self._firecracker_bin}")
        if not self._jailer_bin.exists():
            raise RuntimeError(f"bundled jailer binary not found at {self._jailer_bin}")
        if not Path("/dev/kvm").exists():
            raise RuntimeError("/dev/kvm is not available on this host")

    def create(self, instance: VmInstance) -> None:
        """Materialize environment artifacts and optional network for *instance*.

        On any failure the freshly created workdir is removed so a retry can
        start clean.
        """
        instance.workdir.mkdir(parents=True, exist_ok=False)
        try:
            installed_environment = self._environment_store.ensure_installed(instance.environment)
            if (
                not installed_environment.kernel_image.exists()
                or not installed_environment.rootfs_image.exists()
            ):
                raise RuntimeError(
                    f"missing environment artifacts for {instance.environment}; expected "
                    f"{installed_environment.kernel_image} and {installed_environment.rootfs_image}"
                )
            instance.metadata["environment_version"] = installed_environment.version
            instance.metadata["environment_source"] = installed_environment.source
            if installed_environment.source_digest is not None:
                instance.metadata["environment_digest"] = installed_environment.source_digest
            instance.metadata["environment_install_dir"] = str(installed_environment.install_dir)
            instance.metadata["kernel_image"] = str(installed_environment.kernel_image)
            # Each VM gets its own writable rootfs copy (reflinked when possible).
            rootfs_copy = instance.workdir / "rootfs.ext4"
            instance.metadata["rootfs_clone_mode"] = _copy_rootfs(
                installed_environment.rootfs_image,
                rootfs_copy,
            )
            instance.metadata["rootfs_image"] = str(rootfs_copy)
            if instance.network_requested:
                network = self._network_manager.allocate(instance.vm_id)
                instance.network = network
                instance.metadata.update(self._network_manager.to_metadata(network))
            else:
                instance.network = None
                instance.metadata["network_enabled"] = "false"
        except Exception:
            shutil.rmtree(instance.workdir, ignore_errors=True)
            raise

    def start(self, instance: VmInstance) -> None:
        """Launch the microVM (or record host-compat mode when boot is unsupported)."""
        launch_plan = build_launch_plan(instance)
        instance.metadata["firecracker_config_path"] = str(launch_plan.config_path)
        instance.metadata["guest_network_path"] = str(launch_plan.guest_network_path)
        instance.metadata["guest_exec_path"] = str(launch_plan.guest_exec_path)
        instance.metadata["guest_cid"] = str(launch_plan.guest_cid)
        instance.metadata["guest_exec_port"] = str(launch_plan.vsock_port)
        instance.metadata["guest_exec_uds_path"] = str(instance.workdir / "vsock.sock")
        serial_log_path = instance.workdir / "serial.log"
        firecracker_log_path = instance.workdir / "firecracker.log"
        firecracker_log_path.touch()
        instance.metadata["serial_log_path"] = str(serial_log_path)
        instance.metadata["firecracker_log_path"] = str(firecracker_log_path)
        # Preflight: confirm the bundled binary actually runs on this host.
        proc = subprocess.run(  # noqa: S603
            [str(self._firecracker_bin), "--version"],
            text=True,
            capture_output=True,
            check=False,
        )
        if proc.returncode != 0:
            raise RuntimeError(f"firecracker startup preflight failed: {proc.stderr.strip()}")
        instance.metadata["firecracker_version"] = proc.stdout.strip()
        instance.metadata["jailer_path"] = str(self._jailer_bin)
        if not self._runtime_capabilities.supports_vm_boot:
            # Degrade to the host-compat shim instead of failing outright.
            instance.metadata["execution_mode"] = "host_compat"
            instance.metadata["boot_mode"] = "shim"
            if self._runtime_capabilities.reason is not None:
                instance.metadata["runtime_reason"] = self._runtime_capabilities.reason
            return
        with serial_log_path.open("w", encoding="utf-8") as serial_fp:
            process = subprocess.Popen(  # noqa: S603
                [
                    str(self._firecracker_bin),
                    "--no-api",
                    "--config-file",
                    str(launch_plan.config_path),
                    "--log-path",
                    str(firecracker_log_path),
                    "--level",
                    "Info",
                ],
                stdout=serial_fp,
                stderr=subprocess.STDOUT,
                text=True,
                start_new_session=True,  # detach from our process group
            )
        self._processes[instance.vm_id] = process
        # Give the VMM a moment to crash early if the config is bad.
        time.sleep(2)
        if process.poll() is not None:
            serial_log = serial_log_path.read_text(encoding="utf-8", errors="ignore")
            firecracker_log = firecracker_log_path.read_text(encoding="utf-8", errors="ignore")
            self._processes.pop(instance.vm_id, None)
            raise RuntimeError(
                "firecracker microVM exited during startup: "
                f"{(serial_log or firecracker_log).strip()}"
            )
        instance.firecracker_pid = process.pid
        instance.metadata["execution_mode"] = (
            "guest_vsock" if self._runtime_capabilities.supports_guest_exec else "guest_boot_only"
        )
        instance.metadata["boot_mode"] = "native"

    def exec(
        self,
        instance: VmInstance,
        command: str,
        timeout_seconds: int,
        *,
        workdir: Path | None = None,
    ) -> VmExecResult:
        """Run *command* in the guest over vsock, retrying until the transport
        is ready (bounded at 10s), or fall back to host-compat execution."""
        if self._runtime_capabilities.supports_guest_exec:
            guest_cid = int(instance.metadata["guest_cid"])
            port = int(instance.metadata["guest_exec_port"])
            uds_path = instance.metadata.get("guest_exec_uds_path")
            deadline = time.monotonic() + min(timeout_seconds, 10)
            while True:
                try:
                    response = self._guest_exec_client.exec(
                        guest_cid,
                        port,
                        command,
                        timeout_seconds,
                        uds_path=uds_path,
                    )
                    break
                except (OSError, RuntimeError) as exc:
                    if time.monotonic() >= deadline:
                        raise RuntimeError(
                            f"guest exec transport did not become ready: {exc}"
                        ) from exc
                    time.sleep(0.2)
            return VmExecResult(
                stdout=response.stdout,
                stderr=response.stderr,
                exit_code=response.exit_code,
                duration_ms=response.duration_ms,
            )
        instance.metadata["execution_mode"] = "host_compat"
        return _run_host_command(workdir or instance.workdir, command, timeout_seconds)

    def stop(self, instance: VmInstance) -> None:
        """Terminate the microVM: graceful SIGTERM first, SIGKILL after 5s."""
        process = self._processes.pop(instance.vm_id, None)
        if process is not None:
            process.terminate()
            try:
                process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                process.kill()
                process.wait(timeout=5)
            instance.firecracker_pid = None
            return
        # Fallback path for VMs we did not start in this process (e.g. tasks
        # rehydrated from disk): signal by PID.
        if instance.firecracker_pid is None:
            return
        try:
            os.kill(instance.firecracker_pid, signal.SIGTERM)
        except ProcessLookupError:
            instance.firecracker_pid = None
            return
        deadline = time.monotonic() + 5
        while time.monotonic() < deadline:
            try:
                os.kill(instance.firecracker_pid, 0)
            except ProcessLookupError:
                instance.firecracker_pid = None
                return
            time.sleep(0.1)
        try:
            os.kill(instance.firecracker_pid, signal.SIGKILL)
        except ProcessLookupError:
            # Fix: the process can exit between the last poll and the kill;
            # previously this race let ProcessLookupError escape to callers.
            pass
        instance.firecracker_pid = None

    def delete(self, instance: VmInstance) -> None:
        """Release network resources and remove the VM's on-disk state."""
        self._processes.pop(instance.vm_id, None)
        if instance.network is not None:
            self._network_manager.cleanup(instance.network)
        shutil.rmtree(instance.workdir, ignore_errors=True)
|
|
|
|
|
|
class VmManager:
|
|
"""In-process lifecycle manager for ephemeral VM environments and tasks."""
|
|
|
|
MIN_VCPUS = 1
|
|
MAX_VCPUS = 8
|
|
MIN_MEM_MIB = 256
|
|
MAX_MEM_MIB = 32768
|
|
MIN_TTL_SECONDS = 60
|
|
MAX_TTL_SECONDS = 3600
|
|
DEFAULT_VCPU_COUNT = DEFAULT_VCPU_COUNT
|
|
DEFAULT_MEM_MIB = DEFAULT_MEM_MIB
|
|
DEFAULT_TIMEOUT_SECONDS = DEFAULT_TIMEOUT_SECONDS
|
|
DEFAULT_TTL_SECONDS = DEFAULT_TTL_SECONDS
|
|
DEFAULT_ALLOW_HOST_COMPAT = DEFAULT_ALLOW_HOST_COMPAT
|
|
|
|
    def __init__(
        self,
        *,
        backend_name: str | None = None,
        base_dir: Path | None = None,
        cache_dir: Path | None = None,
        max_active_vms: int = 4,
        runtime_paths: RuntimePaths | None = None,
        network_manager: TapNetworkManager | None = None,
    ) -> None:
        """Wire up the backend, environment store, and on-disk layout.

        ``backend_name`` is "firecracker" (default) or "mock".  The firecracker
        path resolves and verifies runtime binaries; the mock path fabricates
        degraded capabilities and skips checksum verification.
        """
        self._backend_name = backend_name or "firecracker"
        self._base_dir = base_dir or Path("/tmp/pyro-mcp")
        # Persistent task registry lives under <base_dir>/tasks.
        self._tasks_dir = self._base_dir / "tasks"
        resolved_cache_dir = cache_dir or default_cache_dir()
        self._runtime_paths = runtime_paths
        if self._backend_name == "firecracker":
            self._runtime_paths = self._runtime_paths or resolve_runtime_paths()
            self._runtime_capabilities = runtime_capabilities(self._runtime_paths)
            self._environment_store = EnvironmentStore(
                runtime_paths=self._runtime_paths,
                cache_dir=resolved_cache_dir,
            )
        else:
            # Mock backend: no guest at all, so every capability is off.
            self._runtime_capabilities = RuntimeCapabilities(
                supports_vm_boot=False,
                supports_guest_exec=False,
                supports_guest_network=False,
                reason="mock backend does not boot a guest",
            )
            if self._runtime_paths is None:
                self._runtime_paths = resolve_runtime_paths(verify_checksums=False)
            self._environment_store = EnvironmentStore(
                runtime_paths=self._runtime_paths,
                cache_dir=resolved_cache_dir,
            )
        self._max_active_vms = max_active_vms
        if network_manager is not None:
            self._network_manager = network_manager
        elif self._backend_name == "firecracker":
            self._network_manager = TapNetworkManager(enabled=True)
        else:
            self._network_manager = TapNetworkManager(enabled=False)
        # Guards _instances and the disk-backed task registry.
        self._lock = threading.Lock()
        self._instances: dict[str, VmInstance] = {}
        self._base_dir.mkdir(parents=True, exist_ok=True)
        self._tasks_dir.mkdir(parents=True, exist_ok=True)
        self._backend = self._build_backend()
|
|
|
|
def _build_backend(self) -> VmBackend:
|
|
if self._backend_name == "mock":
|
|
return MockBackend()
|
|
if self._backend_name == "firecracker":
|
|
if self._runtime_paths is None:
|
|
raise RuntimeError("runtime paths were not initialized for firecracker backend")
|
|
return FirecrackerBackend(
|
|
self._environment_store,
|
|
firecracker_bin=self._runtime_paths.firecracker_bin,
|
|
jailer_bin=self._runtime_paths.jailer_bin,
|
|
runtime_capabilities=self._runtime_capabilities,
|
|
network_manager=self._network_manager,
|
|
)
|
|
raise ValueError("invalid backend; expected one of: mock, firecracker")
|
|
|
|
def list_environments(self) -> list[dict[str, object]]:
|
|
return self._environment_store.list_environments()
|
|
|
|
def pull_environment(self, environment: str) -> dict[str, object]:
|
|
return self._environment_store.pull_environment(environment)
|
|
|
|
def inspect_environment(self, environment: str) -> dict[str, object]:
|
|
return self._environment_store.inspect_environment(environment)
|
|
|
|
def prune_environments(self) -> dict[str, object]:
|
|
return self._environment_store.prune_environments()
|
|
|
|
    def create_vm(
        self,
        *,
        environment: str,
        vcpu_count: int = DEFAULT_VCPU_COUNT,
        mem_mib: int = DEFAULT_MEM_MIB,
        ttl_seconds: int = DEFAULT_TTL_SECONDS,
        network: bool = False,
        allow_host_compat: bool = DEFAULT_ALLOW_HOST_COMPAT,
    ) -> dict[str, Any]:
        """Register and provision a new ephemeral VM in state "created".

        Raises ValueError for out-of-range sizing and RuntimeError when the
        active VM + task quota is exhausted.  Expired VMs and tasks are
        reaped first so the quota check reflects live occupancy.
        """
        self._validate_limits(vcpu_count=vcpu_count, mem_mib=mem_mib, ttl_seconds=ttl_seconds)
        # Fail fast if the environment name is unknown.
        get_environment(environment, runtime_paths=self._runtime_paths)
        now = time.time()
        with self._lock:
            self._reap_expired_locked(now)
            self._reap_expired_tasks_locked(now)
            # Quota counts ephemeral VMs plus persisted tasks together.
            active_count = len(self._instances) + self._count_tasks_locked()
            if active_count >= self._max_active_vms:
                raise RuntimeError(
                    f"max active VMs reached ({self._max_active_vms}); delete old VMs first"
                )
            vm_id = uuid.uuid4().hex[:12]
            instance = VmInstance(
                vm_id=vm_id,
                environment=environment,
                vcpu_count=vcpu_count,
                mem_mib=mem_mib,
                ttl_seconds=ttl_seconds,
                created_at=now,
                expires_at=now + ttl_seconds,
                workdir=self._base_dir / vm_id,
                network_requested=network,
                allow_host_compat=allow_host_compat,
            )
            instance.metadata["allow_host_compat"] = str(allow_host_compat).lower()
            self._backend.create(instance)
            self._instances[vm_id] = instance
            return self._serialize(instance)
|
|
|
|
def run_vm(
|
|
self,
|
|
*,
|
|
environment: str,
|
|
command: str,
|
|
vcpu_count: int = DEFAULT_VCPU_COUNT,
|
|
mem_mib: int = DEFAULT_MEM_MIB,
|
|
timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
|
|
ttl_seconds: int = DEFAULT_TTL_SECONDS,
|
|
network: bool = False,
|
|
allow_host_compat: bool = DEFAULT_ALLOW_HOST_COMPAT,
|
|
) -> dict[str, Any]:
|
|
created = self.create_vm(
|
|
environment=environment,
|
|
vcpu_count=vcpu_count,
|
|
mem_mib=mem_mib,
|
|
ttl_seconds=ttl_seconds,
|
|
network=network,
|
|
allow_host_compat=allow_host_compat,
|
|
)
|
|
vm_id = str(created["vm_id"])
|
|
try:
|
|
self.start_vm(vm_id)
|
|
return self.exec_vm(vm_id, command=command, timeout_seconds=timeout_seconds)
|
|
except Exception:
|
|
try:
|
|
self.delete_vm(vm_id, reason="run_vm_error_cleanup")
|
|
except ValueError:
|
|
pass
|
|
raise
|
|
|
|
def start_vm(self, vm_id: str) -> dict[str, Any]:
|
|
with self._lock:
|
|
instance = self._get_instance_locked(vm_id)
|
|
self._ensure_not_expired_locked(instance, time.time())
|
|
self._start_instance_locked(instance)
|
|
return self._serialize(instance)
|
|
|
|
    def exec_vm(self, vm_id: str, *, command: str, timeout_seconds: int) -> dict[str, Any]:
        """Run *command* in a started VM, then unconditionally delete the VM.

        This preserves the one-shot run contract: every exec is followed by
        cleanup, and the cleanup receipt is embedded in the result payload.
        """
        with self._lock:
            instance = self._get_instance_locked(vm_id)
            self._ensure_not_expired_locked(instance, time.time())
            exec_instance = instance
        # Exec and cleanup run OUTSIDE the lock: delete_vm re-acquires the
        # same non-reentrant lock, and the command may run for a long time.
        exec_result, execution_mode = self._exec_instance(
            exec_instance,
            command=command,
            timeout_seconds=timeout_seconds,
        )
        cleanup = self.delete_vm(vm_id, reason="post_exec_cleanup")
        return {
            "vm_id": vm_id,
            "environment": exec_instance.environment,
            "environment_version": exec_instance.metadata.get("environment_version"),
            "command": command,
            "stdout": exec_result.stdout,
            "stderr": exec_result.stderr,
            "exit_code": exec_result.exit_code,
            "duration_ms": exec_result.duration_ms,
            "execution_mode": execution_mode,
            "cleanup": cleanup,
        }
|
|
|
|
def stop_vm(self, vm_id: str) -> dict[str, Any]:
|
|
with self._lock:
|
|
instance = self._get_instance_locked(vm_id)
|
|
self._backend.stop(instance)
|
|
instance.state = "stopped"
|
|
return self._serialize(instance)
|
|
|
|
def delete_vm(self, vm_id: str, *, reason: str = "explicit_delete") -> dict[str, Any]:
|
|
with self._lock:
|
|
instance = self._get_instance_locked(vm_id)
|
|
if instance.state == "started":
|
|
self._backend.stop(instance)
|
|
instance.state = "stopped"
|
|
self._backend.delete(instance)
|
|
del self._instances[vm_id]
|
|
return {"vm_id": vm_id, "deleted": True, "reason": reason}
|
|
|
|
def status_vm(self, vm_id: str) -> dict[str, Any]:
|
|
with self._lock:
|
|
instance = self._get_instance_locked(vm_id)
|
|
self._ensure_not_expired_locked(instance, time.time())
|
|
return self._serialize(instance)
|
|
|
|
def network_info_vm(self, vm_id: str) -> dict[str, Any]:
|
|
with self._lock:
|
|
instance = self._get_instance_locked(vm_id)
|
|
self._ensure_not_expired_locked(instance, time.time())
|
|
if instance.network is None:
|
|
return {
|
|
"vm_id": vm_id,
|
|
"network_enabled": False,
|
|
"outbound_connectivity_expected": False,
|
|
"reason": "network configuration is unavailable for this VM",
|
|
}
|
|
return {"vm_id": vm_id, **self._network_manager.network_info(instance.network)}
|
|
|
|
def reap_expired(self) -> dict[str, Any]:
|
|
now = time.time()
|
|
with self._lock:
|
|
expired_vm_ids = [
|
|
vm_id for vm_id, inst in self._instances.items() if inst.expires_at <= now
|
|
]
|
|
for vm_id in expired_vm_ids:
|
|
instance = self._instances[vm_id]
|
|
if instance.state == "started":
|
|
self._backend.stop(instance)
|
|
instance.state = "stopped"
|
|
self._backend.delete(instance)
|
|
del self._instances[vm_id]
|
|
return {"deleted_vm_ids": expired_vm_ids, "count": len(expired_vm_ids)}
|
|
|
|
    def create_task(
        self,
        *,
        environment: str,
        vcpu_count: int = DEFAULT_VCPU_COUNT,
        mem_mib: int = DEFAULT_MEM_MIB,
        ttl_seconds: int = DEFAULT_TTL_SECONDS,
        network: bool = False,
        allow_host_compat: bool = DEFAULT_ALLOW_HOST_COMPAT,
    ) -> dict[str, Any]:
        """Create, boot, and persist a long-lived task workspace.

        Unlike create_vm, the task is auto-started and its /workspace directory
        is pre-created in the guest.  On any failure all partially created
        state (VM runtime + task directory tree) is rolled back.
        """
        self._validate_limits(vcpu_count=vcpu_count, mem_mib=mem_mib, ttl_seconds=ttl_seconds)
        get_environment(environment, runtime_paths=self._runtime_paths)
        now = time.time()
        task_id = uuid.uuid4().hex[:12]
        # Layout: <tasks_dir>/<task_id>/{runtime,workspace,commands}
        task_dir = self._task_dir(task_id)
        runtime_dir = self._task_runtime_dir(task_id)
        workspace_dir = self._task_workspace_dir(task_id)
        commands_dir = self._task_commands_dir(task_id)
        task_dir.mkdir(parents=True, exist_ok=False)
        workspace_dir.mkdir(parents=True, exist_ok=True)
        commands_dir.mkdir(parents=True, exist_ok=True)
        instance = VmInstance(
            vm_id=task_id,
            environment=environment,
            vcpu_count=vcpu_count,
            mem_mib=mem_mib,
            ttl_seconds=ttl_seconds,
            created_at=now,
            expires_at=now + ttl_seconds,
            workdir=runtime_dir,
            network_requested=network,
            allow_host_compat=allow_host_compat,
        )
        instance.metadata["allow_host_compat"] = str(allow_host_compat).lower()
        instance.metadata["workspace_path"] = TASK_WORKSPACE_GUEST_PATH
        instance.metadata["workspace_host_dir"] = str(workspace_dir)
        try:
            with self._lock:
                # Reap first so quota reflects live occupancy, then provision
                # under the lock so concurrent creates cannot exceed the cap.
                self._reap_expired_locked(now)
                self._reap_expired_tasks_locked(now)
                active_count = len(self._instances) + self._count_tasks_locked()
                if active_count >= self._max_active_vms:
                    raise RuntimeError(
                        f"max active VMs reached ({self._max_active_vms}); delete old VMs first"
                    )
                self._backend.create(instance)
            with self._lock:
                self._start_instance_locked(instance)
                self._require_guest_exec_or_opt_in(instance)
                if self._runtime_capabilities.supports_guest_exec:
                    # Pre-create the guest-side workspace directory.
                    self._backend.exec(
                        instance,
                        f"mkdir -p {shlex.quote(TASK_WORKSPACE_GUEST_PATH)}",
                        10,
                    )
                else:
                    instance.metadata["execution_mode"] = "host_compat"
                task = TaskRecord.from_instance(instance)
                self._save_task_locked(task)
                return self._serialize_task(task)
        except Exception:
            # Best-effort rollback of whatever got provisioned before the failure.
            if runtime_dir.exists():
                try:
                    if instance.state == "started":
                        self._backend.stop(instance)
                        instance.state = "stopped"
                except Exception:
                    pass
                try:
                    self._backend.delete(instance)
                except Exception:
                    pass
            shutil.rmtree(task_dir, ignore_errors=True)
            raise
|
|
|
|
    def exec_task(self, task_id: str, *, command: str, timeout_seconds: int = 30) -> dict[str, Any]:
        """Run *command* in the task's /workspace and journal the result.

        Repeated, non-cleaning exec: the task stays alive across calls and
        each command is appended to the persisted journal with a sequence
        number.
        """
        if timeout_seconds <= 0:
            raise ValueError("timeout_seconds must be positive")
        with self._lock:
            task = self._load_task_locked(task_id)
            self._ensure_task_not_expired_locked(task, time.time())
            self._refresh_task_liveness_locked(task)
            if task.state != "started":
                raise RuntimeError(f"task {task_id} must be in 'started' state before task_exec")
            instance = task.to_instance(workdir=self._task_runtime_dir(task.task_id))
        # The command itself runs outside the lock (it may take a long time);
        # the record is re-loaded afterwards under the lock before saving.
        exec_result, execution_mode = self._exec_instance(
            instance,
            command=command,
            timeout_seconds=timeout_seconds,
            host_workdir=self._task_workspace_dir(task.task_id),
            guest_cwd=TASK_WORKSPACE_GUEST_PATH,
        )
        with self._lock:
            task = self._load_task_locked(task_id)
            # Propagate any state changes exec observed on the instance.
            task.state = instance.state
            task.firecracker_pid = instance.firecracker_pid
            task.last_error = instance.last_error
            task.metadata = dict(instance.metadata)
            entry = self._record_task_command_locked(
                task,
                command=command,
                exec_result=exec_result,
                execution_mode=execution_mode,
                cwd=TASK_WORKSPACE_GUEST_PATH,
            )
            self._save_task_locked(task)
        return {
            "task_id": task_id,
            "environment": task.environment,
            "environment_version": task.metadata.get("environment_version"),
            "command": command,
            "stdout": exec_result.stdout,
            "stderr": exec_result.stderr,
            "exit_code": exec_result.exit_code,
            "duration_ms": exec_result.duration_ms,
            "execution_mode": execution_mode,
            "sequence": entry["sequence"],
            "cwd": TASK_WORKSPACE_GUEST_PATH,
        }
|
|
|
|
def status_task(self, task_id: str) -> dict[str, Any]:
|
|
with self._lock:
|
|
task = self._load_task_locked(task_id)
|
|
self._ensure_task_not_expired_locked(task, time.time())
|
|
self._refresh_task_liveness_locked(task)
|
|
self._save_task_locked(task)
|
|
return self._serialize_task(task)
|
|
|
|
def logs_task(self, task_id: str) -> dict[str, Any]:
|
|
with self._lock:
|
|
task = self._load_task_locked(task_id)
|
|
self._ensure_task_not_expired_locked(task, time.time())
|
|
self._refresh_task_liveness_locked(task)
|
|
self._save_task_locked(task)
|
|
entries = self._read_task_logs_locked(task.task_id)
|
|
return {"task_id": task.task_id, "count": len(entries), "entries": entries}
|
|
|
|
def delete_task(self, task_id: str, *, reason: str = "explicit_delete") -> dict[str, Any]:
|
|
with self._lock:
|
|
task = self._load_task_locked(task_id)
|
|
instance = task.to_instance(workdir=self._task_runtime_dir(task.task_id))
|
|
if task.state == "started":
|
|
self._backend.stop(instance)
|
|
task.state = "stopped"
|
|
self._backend.delete(instance)
|
|
shutil.rmtree(self._task_dir(task_id), ignore_errors=True)
|
|
return {"task_id": task_id, "deleted": True, "reason": reason}
|
|
|
|
def _validate_limits(self, *, vcpu_count: int, mem_mib: int, ttl_seconds: int) -> None:
|
|
if not self.MIN_VCPUS <= vcpu_count <= self.MAX_VCPUS:
|
|
raise ValueError(f"vcpu_count must be between {self.MIN_VCPUS} and {self.MAX_VCPUS}")
|
|
if not self.MIN_MEM_MIB <= mem_mib <= self.MAX_MEM_MIB:
|
|
raise ValueError(f"mem_mib must be between {self.MIN_MEM_MIB} and {self.MAX_MEM_MIB}")
|
|
if not self.MIN_TTL_SECONDS <= ttl_seconds <= self.MAX_TTL_SECONDS:
|
|
raise ValueError(
|
|
f"ttl_seconds must be between {self.MIN_TTL_SECONDS} and {self.MAX_TTL_SECONDS}"
|
|
)
|
|
|
|
def _serialize(self, instance: VmInstance) -> dict[str, Any]:
|
|
return {
|
|
"vm_id": instance.vm_id,
|
|
"environment": instance.environment,
|
|
"environment_version": instance.metadata.get("environment_version"),
|
|
"vcpu_count": instance.vcpu_count,
|
|
"mem_mib": instance.mem_mib,
|
|
"ttl_seconds": instance.ttl_seconds,
|
|
"created_at": instance.created_at,
|
|
"expires_at": instance.expires_at,
|
|
"state": instance.state,
|
|
"network_enabled": instance.network is not None,
|
|
"allow_host_compat": instance.allow_host_compat,
|
|
"guest_ip": instance.network.guest_ip if instance.network is not None else None,
|
|
"tap_name": instance.network.tap_name if instance.network is not None else None,
|
|
"execution_mode": instance.metadata.get("execution_mode", "pending"),
|
|
"metadata": instance.metadata,
|
|
}
|
|
|
|
def _serialize_task(self, task: TaskRecord) -> dict[str, Any]:
|
|
return {
|
|
"task_id": task.task_id,
|
|
"environment": task.environment,
|
|
"environment_version": task.metadata.get("environment_version"),
|
|
"vcpu_count": task.vcpu_count,
|
|
"mem_mib": task.mem_mib,
|
|
"ttl_seconds": task.ttl_seconds,
|
|
"created_at": task.created_at,
|
|
"expires_at": task.expires_at,
|
|
"state": task.state,
|
|
"network_enabled": task.network is not None,
|
|
"allow_host_compat": task.allow_host_compat,
|
|
"guest_ip": task.network.guest_ip if task.network is not None else None,
|
|
"tap_name": task.network.tap_name if task.network is not None else None,
|
|
"execution_mode": task.metadata.get("execution_mode", "pending"),
|
|
"workspace_path": TASK_WORKSPACE_GUEST_PATH,
|
|
"command_count": task.command_count,
|
|
"last_command": task.last_command,
|
|
"metadata": task.metadata,
|
|
}
|
|
|
|
def _require_guest_boot_or_opt_in(self, instance: VmInstance) -> None:
|
|
if self._runtime_capabilities.supports_vm_boot or instance.allow_host_compat:
|
|
return
|
|
reason = self._runtime_capabilities.reason or "runtime does not support real VM boot"
|
|
raise RuntimeError(
|
|
"guest boot is unavailable and host compatibility mode is disabled: "
|
|
f"{reason}. Set allow_host_compat=True (CLI: --allow-host-compat) to opt into "
|
|
"host execution."
|
|
)
|
|
|
|
def _require_guest_exec_or_opt_in(self, instance: VmInstance) -> None:
|
|
if self._runtime_capabilities.supports_guest_exec or instance.allow_host_compat:
|
|
return
|
|
reason = self._runtime_capabilities.reason or (
|
|
"runtime does not support guest command execution"
|
|
)
|
|
raise RuntimeError(
|
|
"guest command execution is unavailable and host compatibility mode is disabled: "
|
|
f"{reason}. Set allow_host_compat=True (CLI: --allow-host-compat) to opt into "
|
|
"host execution."
|
|
)
|
|
|
|
def _get_instance_locked(self, vm_id: str) -> VmInstance:
|
|
try:
|
|
return self._instances[vm_id]
|
|
except KeyError as exc:
|
|
raise ValueError(f"vm {vm_id!r} does not exist") from exc
|
|
|
|
def _reap_expired_locked(self, now: float) -> None:
|
|
expired_vm_ids = [
|
|
vm_id for vm_id, inst in self._instances.items() if inst.expires_at <= now
|
|
]
|
|
for vm_id in expired_vm_ids:
|
|
instance = self._instances[vm_id]
|
|
if instance.state == "started":
|
|
self._backend.stop(instance)
|
|
instance.state = "stopped"
|
|
self._backend.delete(instance)
|
|
del self._instances[vm_id]
|
|
|
|
def _ensure_not_expired_locked(self, instance: VmInstance, now: float) -> None:
|
|
if instance.expires_at <= now:
|
|
vm_id = instance.vm_id
|
|
self._reap_expired_locked(now)
|
|
raise RuntimeError(f"vm {vm_id!r} expired and was automatically deleted")
|
|
|
|
def _start_instance_locked(self, instance: VmInstance) -> None:
|
|
if instance.state not in {"created", "stopped"}:
|
|
raise RuntimeError(
|
|
f"vm {instance.vm_id} cannot be started from state {instance.state!r}"
|
|
)
|
|
self._require_guest_boot_or_opt_in(instance)
|
|
if not self._runtime_capabilities.supports_vm_boot:
|
|
instance.metadata["execution_mode"] = "host_compat"
|
|
instance.metadata["boot_mode"] = "compat"
|
|
if self._runtime_capabilities.reason is not None:
|
|
instance.metadata["runtime_reason"] = self._runtime_capabilities.reason
|
|
self._backend.start(instance)
|
|
instance.state = "started"
|
|
|
|
def _exec_instance(
|
|
self,
|
|
instance: VmInstance,
|
|
*,
|
|
command: str,
|
|
timeout_seconds: int,
|
|
host_workdir: Path | None = None,
|
|
guest_cwd: str | None = None,
|
|
) -> tuple[VmExecResult, str]:
|
|
if timeout_seconds <= 0:
|
|
raise ValueError("timeout_seconds must be positive")
|
|
if instance.state != "started":
|
|
raise RuntimeError(f"vm {instance.vm_id} must be in 'started' state before execution")
|
|
self._require_guest_exec_or_opt_in(instance)
|
|
prepared_command = command
|
|
if self._runtime_capabilities.supports_guest_exec:
|
|
prepared_command = _wrap_guest_command(command, cwd=guest_cwd)
|
|
workdir = None
|
|
else:
|
|
instance.metadata["execution_mode"] = "host_compat"
|
|
workdir = host_workdir
|
|
exec_result = self._backend.exec(
|
|
instance,
|
|
prepared_command,
|
|
timeout_seconds,
|
|
workdir=workdir,
|
|
)
|
|
execution_mode = instance.metadata.get("execution_mode", "unknown")
|
|
return exec_result, execution_mode
|
|
|
|
def _task_dir(self, task_id: str) -> Path:
|
|
return self._tasks_dir / task_id
|
|
|
|
def _task_runtime_dir(self, task_id: str) -> Path:
|
|
return self._task_dir(task_id) / TASK_RUNTIME_DIRNAME
|
|
|
|
def _task_workspace_dir(self, task_id: str) -> Path:
|
|
return self._task_dir(task_id) / TASK_WORKSPACE_DIRNAME
|
|
|
|
def _task_commands_dir(self, task_id: str) -> Path:
|
|
return self._task_dir(task_id) / TASK_COMMANDS_DIRNAME
|
|
|
|
def _task_metadata_path(self, task_id: str) -> Path:
|
|
return self._task_dir(task_id) / "task.json"
|
|
|
|
def _count_tasks_locked(self) -> int:
|
|
return sum(1 for _ in self._tasks_dir.glob("*/task.json"))
|
|
|
|
def _load_task_locked(self, task_id: str) -> TaskRecord:
|
|
metadata_path = self._task_metadata_path(task_id)
|
|
if not metadata_path.exists():
|
|
raise ValueError(f"task {task_id!r} does not exist")
|
|
payload = json.loads(metadata_path.read_text(encoding="utf-8"))
|
|
if not isinstance(payload, dict):
|
|
raise RuntimeError(f"task record at {metadata_path} is invalid")
|
|
return TaskRecord.from_payload(payload)
|
|
|
|
def _save_task_locked(self, task: TaskRecord) -> None:
|
|
metadata_path = self._task_metadata_path(task.task_id)
|
|
metadata_path.parent.mkdir(parents=True, exist_ok=True)
|
|
metadata_path.write_text(
|
|
json.dumps(task.to_payload(), indent=2, sort_keys=True),
|
|
encoding="utf-8",
|
|
)
|
|
|
|
def _reap_expired_tasks_locked(self, now: float) -> None:
|
|
for metadata_path in list(self._tasks_dir.glob("*/task.json")):
|
|
payload = json.loads(metadata_path.read_text(encoding="utf-8"))
|
|
if not isinstance(payload, dict):
|
|
shutil.rmtree(metadata_path.parent, ignore_errors=True)
|
|
continue
|
|
task = TaskRecord.from_payload(payload)
|
|
if task.expires_at > now:
|
|
continue
|
|
instance = task.to_instance(workdir=self._task_runtime_dir(task.task_id))
|
|
if task.state == "started":
|
|
self._backend.stop(instance)
|
|
task.state = "stopped"
|
|
self._backend.delete(instance)
|
|
shutil.rmtree(self._task_dir(task.task_id), ignore_errors=True)
|
|
|
|
def _ensure_task_not_expired_locked(self, task: TaskRecord, now: float) -> None:
|
|
if task.expires_at <= now:
|
|
task_id = task.task_id
|
|
self._reap_expired_tasks_locked(now)
|
|
raise RuntimeError(f"task {task_id!r} expired and was automatically deleted")
|
|
|
|
def _refresh_task_liveness_locked(self, task: TaskRecord) -> None:
|
|
if task.state != "started":
|
|
return
|
|
execution_mode = task.metadata.get("execution_mode")
|
|
if execution_mode == "host_compat":
|
|
return
|
|
if _pid_is_running(task.firecracker_pid):
|
|
return
|
|
task.state = "stopped"
|
|
task.firecracker_pid = None
|
|
task.last_error = "backing guest process is no longer running"
|
|
|
|
def _record_task_command_locked(
|
|
self,
|
|
task: TaskRecord,
|
|
*,
|
|
command: str,
|
|
exec_result: VmExecResult,
|
|
execution_mode: str,
|
|
cwd: str,
|
|
) -> dict[str, Any]:
|
|
sequence = task.command_count + 1
|
|
commands_dir = self._task_commands_dir(task.task_id)
|
|
commands_dir.mkdir(parents=True, exist_ok=True)
|
|
base_name = f"{sequence:06d}"
|
|
stdout_path = commands_dir / f"{base_name}.stdout"
|
|
stderr_path = commands_dir / f"{base_name}.stderr"
|
|
record_path = commands_dir / f"{base_name}.json"
|
|
stdout_path.write_text(exec_result.stdout, encoding="utf-8")
|
|
stderr_path.write_text(exec_result.stderr, encoding="utf-8")
|
|
entry: dict[str, Any] = {
|
|
"sequence": sequence,
|
|
"command": command,
|
|
"cwd": cwd,
|
|
"exit_code": exec_result.exit_code,
|
|
"duration_ms": exec_result.duration_ms,
|
|
"execution_mode": execution_mode,
|
|
"stdout_file": stdout_path.name,
|
|
"stderr_file": stderr_path.name,
|
|
"recorded_at": time.time(),
|
|
}
|
|
record_path.write_text(json.dumps(entry, indent=2, sort_keys=True), encoding="utf-8")
|
|
task.command_count = sequence
|
|
task.last_command = {
|
|
"sequence": sequence,
|
|
"command": command,
|
|
"cwd": cwd,
|
|
"exit_code": exec_result.exit_code,
|
|
"duration_ms": exec_result.duration_ms,
|
|
"execution_mode": execution_mode,
|
|
}
|
|
return entry
|
|
|
|
def _read_task_logs_locked(self, task_id: str) -> list[dict[str, Any]]:
|
|
entries: list[dict[str, Any]] = []
|
|
commands_dir = self._task_commands_dir(task_id)
|
|
if not commands_dir.exists():
|
|
return entries
|
|
for record_path in sorted(commands_dir.glob("*.json")):
|
|
payload = json.loads(record_path.read_text(encoding="utf-8"))
|
|
if not isinstance(payload, dict):
|
|
continue
|
|
stdout_name = str(payload.get("stdout_file", ""))
|
|
stderr_name = str(payload.get("stderr_file", ""))
|
|
stdout = ""
|
|
stderr = ""
|
|
if stdout_name != "":
|
|
stdout_path = commands_dir / stdout_name
|
|
if stdout_path.exists():
|
|
stdout = stdout_path.read_text(encoding="utf-8")
|
|
if stderr_name != "":
|
|
stderr_path = commands_dir / stderr_name
|
|
if stderr_path.exists():
|
|
stderr = stderr_path.read_text(encoding="utf-8")
|
|
entry = dict(payload)
|
|
entry["stdout"] = stdout
|
|
entry["stderr"] = stderr
|
|
entries.append(entry)
|
|
return entries
|