Enable real guest networking and make demos network-first

This commit is contained in:
Thales Maciel 2026-03-06 22:47:16 -03:00
parent c43c718c83
commit b01efa6452
14 changed files with 618 additions and 72 deletions

View file

@ -4,6 +4,7 @@ from __future__ import annotations
import os
import shutil
import signal
import subprocess
import threading
import time
@ -143,6 +144,7 @@ class FirecrackerBackend(VmBackend): # pragma: no cover
self._runtime_capabilities = runtime_capabilities
self._network_manager = network_manager or TapNetworkManager()
self._guest_exec_client = guest_exec_client or VsockExecClient()
self._processes: dict[str, subprocess.Popen[str]] = {}
if not self._firecracker_bin.exists():
raise RuntimeError(f"bundled firecracker binary not found at {self._firecracker_bin}")
if not self._jailer_bin.exists():
@ -160,10 +162,16 @@ class FirecrackerBackend(VmBackend): # pragma: no cover
f"{artifacts.kernel_image} and {artifacts.rootfs_image}"
)
instance.metadata["kernel_image"] = str(artifacts.kernel_image)
instance.metadata["rootfs_image"] = str(artifacts.rootfs_image)
network = self._network_manager.allocate(instance.vm_id)
instance.network = network
instance.metadata.update(self._network_manager.to_metadata(network))
rootfs_copy = instance.workdir / "rootfs.ext4"
shutil.copy2(artifacts.rootfs_image, rootfs_copy)
instance.metadata["rootfs_image"] = str(rootfs_copy)
if self._network_manager.enabled:
network = self._network_manager.allocate(instance.vm_id)
instance.network = network
instance.metadata.update(self._network_manager.to_metadata(network))
else:
instance.network = None
instance.metadata["network_enabled"] = "false"
except Exception:
shutil.rmtree(instance.workdir, ignore_errors=True)
raise
@ -175,6 +183,12 @@ class FirecrackerBackend(VmBackend): # pragma: no cover
instance.metadata["guest_exec_path"] = str(launch_plan.guest_exec_path)
instance.metadata["guest_cid"] = str(launch_plan.guest_cid)
instance.metadata["guest_exec_port"] = str(launch_plan.vsock_port)
instance.metadata["guest_exec_uds_path"] = str(instance.workdir / "vsock.sock")
serial_log_path = instance.workdir / "serial.log"
firecracker_log_path = instance.workdir / "firecracker.log"
firecracker_log_path.touch()
instance.metadata["serial_log_path"] = str(serial_log_path)
instance.metadata["firecracker_log_path"] = str(firecracker_log_path)
proc = subprocess.run( # noqa: S603
[str(self._firecracker_bin), "--version"],
text=True,
@ -191,14 +205,60 @@ class FirecrackerBackend(VmBackend): # pragma: no cover
if self._runtime_capabilities.reason is not None:
instance.metadata["runtime_reason"] = self._runtime_capabilities.reason
return
instance.metadata["execution_mode"] = "guest_vsock"
with serial_log_path.open("w", encoding="utf-8") as serial_fp:
process = subprocess.Popen( # noqa: S603
[
str(self._firecracker_bin),
"--no-api",
"--config-file",
str(launch_plan.config_path),
"--log-path",
str(firecracker_log_path),
"--level",
"Info",
],
stdout=serial_fp,
stderr=subprocess.STDOUT,
text=True,
)
self._processes[instance.vm_id] = process
time.sleep(2)
if process.poll() is not None:
serial_log = serial_log_path.read_text(encoding="utf-8", errors="ignore")
firecracker_log = firecracker_log_path.read_text(encoding="utf-8", errors="ignore")
self._processes.pop(instance.vm_id, None)
raise RuntimeError(
"firecracker microVM exited during startup: "
f"{(serial_log or firecracker_log).strip()}"
)
instance.firecracker_pid = process.pid
instance.metadata["execution_mode"] = (
"guest_vsock" if self._runtime_capabilities.supports_guest_exec else "host_compat"
)
instance.metadata["boot_mode"] = "native"
def exec(self, instance: VmInstance, command: str, timeout_seconds: int) -> VmExecResult:
if self._runtime_capabilities.supports_guest_exec:
guest_cid = int(instance.metadata["guest_cid"])
port = int(instance.metadata["guest_exec_port"])
response = self._guest_exec_client.exec(guest_cid, port, command, timeout_seconds)
uds_path = instance.metadata.get("guest_exec_uds_path")
deadline = time.monotonic() + min(timeout_seconds, 10)
while True:
try:
response = self._guest_exec_client.exec(
guest_cid,
port,
command,
timeout_seconds,
uds_path=uds_path,
)
break
except (OSError, RuntimeError) as exc:
if time.monotonic() >= deadline:
raise RuntimeError(
f"guest exec transport did not become ready: {exc}"
) from exc
time.sleep(0.2)
return VmExecResult(
stdout=response.stdout,
stderr=response.stderr,
@ -209,9 +269,36 @@ class FirecrackerBackend(VmBackend): # pragma: no cover
return _run_host_command(instance.workdir, command, timeout_seconds)
def stop(self, instance: VmInstance) -> None:
    """Stop the Firecracker process backing ``instance``, if there is one.

    Two cases are handled:

    * The process was launched by this backend object and is still tracked
      in ``self._processes``: it is terminated through its ``Popen`` handle
      and force-killed if it has not exited within 5 seconds.
    * Only ``instance.firecracker_pid`` is known: the PID is sent SIGTERM,
      polled for up to 5 seconds, then sent SIGKILL.

    ``instance.firecracker_pid`` is reset to ``None`` once the process is
    gone. Calling this for an instance with no tracked process and no PID
    is a no-op.

    Args:
        instance: The VM whose Firecracker process should be stopped.
    """
    # NOTE: the parameter must stay bound. A leading `del instance` (left
    # over from the earlier no-op implementation) makes every access below
    # raise UnboundLocalError.
    process = self._processes.pop(instance.vm_id, None)
    if process is not None:
        process.terminate()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            # Graceful shutdown timed out; escalate and reap the child.
            process.kill()
            process.wait(timeout=5)
        instance.firecracker_pid = None
        return

    pid = instance.firecracker_pid
    if pid is None:
        return

    try:
        os.kill(pid, signal.SIGTERM)
    except ProcessLookupError:
        # Already gone; just clear the stale PID.
        instance.firecracker_pid = None
        return

    deadline = time.monotonic() + 5
    while time.monotonic() < deadline:
        try:
            # Signal 0 delivers nothing; it only checks the PID still exists.
            os.kill(pid, 0)
        except ProcessLookupError:
            instance.firecracker_pid = None
            return
        time.sleep(0.1)

    try:
        os.kill(pid, signal.SIGKILL)
    except ProcessLookupError:
        # The process exited between the last probe and the kill.
        pass
    instance.firecracker_pid = None
def delete(self, instance: VmInstance) -> None:
    """Release the host-side resources associated with ``instance``.

    Drops any process handle tracked for the VM (without signalling it),
    hands the allocated network, if any, back to the network manager, and
    removes the instance work directory, ignoring removal errors.

    Args:
        instance: The VM whose resources should be released.
    """
    self._processes.pop(instance.vm_id, None)

    allocated_network = instance.network
    if allocated_network is not None:
        self._network_manager.cleanup(allocated_network)

    shutil.rmtree(instance.workdir, ignore_errors=True)