Wait for real guest vsock health before opencode
Make vm create wait for the guest-side vsock /healthz endpoint instead of only waiting for the host socket path, so the wait_vsock_agent stage reflects actual guest readiness. Start banger-vsock-agent earlier in the Alpine OpenRC graph and report later /ports failures as guest-service waits rather than vsock-agent waits, which makes the progress output match what the guest is really doing. Validate with go test ./..., a rebuilt managed alpine image, and a fresh vm create --image alpine --name alp --nat that now progresses through wait_vsock_agent -> wait_guest_ready -> wait_opencode -> ready.
This commit is contained in:
parent
a166068fab
commit
092d848620
5 changed files with 157 additions and 4 deletions
|
|
@ -4,6 +4,7 @@ import (
|
||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
"net"
|
"net"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
|
@ -25,6 +26,8 @@ import (
|
||||||
var (
|
var (
|
||||||
errWaitForExitTimeout = errors.New("timed out waiting for VM to exit")
|
errWaitForExitTimeout = errors.New("timed out waiting for VM to exit")
|
||||||
gracefulShutdownWait = 10 * time.Second
|
gracefulShutdownWait = 10 * time.Second
|
||||||
|
vsockReadyWait = 30 * time.Second
|
||||||
|
vsockReadyPoll = 200 * time.Millisecond
|
||||||
)
|
)
|
||||||
|
|
||||||
func (d *Daemon) CreateVM(ctx context.Context, params api.VMCreateParams) (vm model.VMRecord, err error) {
|
func (d *Daemon) CreateVM(ctx context.Context, params api.VMCreateParams) (vm model.VMRecord, err error) {
|
||||||
|
|
@ -314,10 +317,13 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
|
||||||
return cleanupOnErr(err)
|
return cleanupOnErr(err)
|
||||||
}
|
}
|
||||||
op.stage("vsock_access", "vsock_path", vm.Runtime.VSockPath, "vsock_cid", vm.Runtime.VSockCID)
|
op.stage("vsock_access", "vsock_path", vm.Runtime.VSockPath, "vsock_cid", vm.Runtime.VSockCID)
|
||||||
vmCreateStage(ctx, "wait_vsock_agent", "waiting for guest vsock agent")
|
|
||||||
if err := d.ensureSocketAccess(ctx, vm.Runtime.VSockPath, "firecracker vsock socket"); err != nil {
|
if err := d.ensureSocketAccess(ctx, vm.Runtime.VSockPath, "firecracker vsock socket"); err != nil {
|
||||||
return cleanupOnErr(err)
|
return cleanupOnErr(err)
|
||||||
}
|
}
|
||||||
|
vmCreateStage(ctx, "wait_vsock_agent", "waiting for guest vsock agent")
|
||||||
|
if err := waitForGuestVSockAgent(ctx, d.logger, vm.Runtime.VSockPath, vsockReadyWait); err != nil {
|
||||||
|
return cleanupOnErr(err)
|
||||||
|
}
|
||||||
op.stage("post_start_features")
|
op.stage("post_start_features")
|
||||||
vmCreateStage(ctx, "wait_guest_ready", "waiting for guest services")
|
vmCreateStage(ctx, "wait_guest_ready", "waiting for guest services")
|
||||||
if err := d.postStartCapabilities(ctx, vm, image); err != nil {
|
if err := d.postStartCapabilities(ctx, vm, image); err != nil {
|
||||||
|
|
@ -1150,6 +1156,38 @@ func waitForPath(ctx context.Context, path string, timeout time.Duration, label
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func waitForGuestVSockAgent(ctx context.Context, logger *slog.Logger, socketPath string, timeout time.Duration) error {
|
||||||
|
if strings.TrimSpace(socketPath) == "" {
|
||||||
|
return errors.New("vsock path is required")
|
||||||
|
}
|
||||||
|
|
||||||
|
waitCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
ticker := time.NewTicker(vsockReadyPoll)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
var lastErr error
|
||||||
|
for {
|
||||||
|
pingCtx, pingCancel := context.WithTimeout(waitCtx, 3*time.Second)
|
||||||
|
err := vsockagent.Health(pingCtx, logger, socketPath)
|
||||||
|
pingCancel()
|
||||||
|
if err == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
lastErr = err
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-waitCtx.Done():
|
||||||
|
if lastErr != nil {
|
||||||
|
return fmt.Errorf("guest vsock agent not ready: %w", lastErr)
|
||||||
|
}
|
||||||
|
return errors.New("guest vsock agent not ready before timeout")
|
||||||
|
case <-ticker.C:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (d *Daemon) setDNS(ctx context.Context, vmName, guestIP string) error {
|
func (d *Daemon) setDNS(ctx context.Context, vmName, guestIP string) error {
|
||||||
if d.vmDNS == nil {
|
if d.vmDNS == nil {
|
||||||
return nil
|
return nil
|
||||||
|
|
|
||||||
|
|
@ -415,6 +415,86 @@ func TestPingVMAliasReturnsAliveForHealthyVM(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestWaitForGuestVSockAgentRetriesUntilHealthy(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
socketPath := filepath.Join(t.TempDir(), "fc.vsock")
|
||||||
|
listener, err := net.Listen("unix", socketPath)
|
||||||
|
if err != nil {
|
||||||
|
skipIfSocketRestricted(t, err)
|
||||||
|
t.Fatalf("listen vsock: %v", err)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
_ = listener.Close()
|
||||||
|
_ = os.Remove(socketPath)
|
||||||
|
})
|
||||||
|
|
||||||
|
serverDone := make(chan error, 1)
|
||||||
|
go func() {
|
||||||
|
for attempt := 0; attempt < 2; attempt++ {
|
||||||
|
conn, err := listener.Accept()
|
||||||
|
if err != nil {
|
||||||
|
serverDone <- err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
buf := make([]byte, 512)
|
||||||
|
n, err := conn.Read(buf)
|
||||||
|
if err != nil {
|
||||||
|
_ = conn.Close()
|
||||||
|
serverDone <- err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if got := string(buf[:n]); got != "CONNECT 42070\n" {
|
||||||
|
_ = conn.Close()
|
||||||
|
serverDone <- fmt.Errorf("unexpected connect message %q", got)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if _, err := conn.Write([]byte("OK 1\n")); err != nil {
|
||||||
|
_ = conn.Close()
|
||||||
|
serverDone <- err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if attempt == 0 {
|
||||||
|
_ = conn.Close()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
reqBuf := make([]byte, 0, 512)
|
||||||
|
for {
|
||||||
|
n, err = conn.Read(buf)
|
||||||
|
if err != nil {
|
||||||
|
_ = conn.Close()
|
||||||
|
serverDone <- err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
reqBuf = append(reqBuf, buf[:n]...)
|
||||||
|
if strings.Contains(string(reqBuf), "\r\n\r\n") {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if got := string(reqBuf); !strings.Contains(got, "GET /healthz HTTP/1.1\r\n") {
|
||||||
|
_ = conn.Close()
|
||||||
|
serverDone <- fmt.Errorf("unexpected health payload %q", got)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_, err = conn.Write([]byte("HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: 15\r\n\r\n{\"status\":\"ok\"}"))
|
||||||
|
_ = conn.Close()
|
||||||
|
serverDone <- err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
serverDone <- errors.New("health probe did not retry")
|
||||||
|
}()
|
||||||
|
|
||||||
|
if err := waitForGuestVSockAgent(context.Background(), nil, socketPath, time.Second); err != nil {
|
||||||
|
t.Fatalf("waitForGuestVSockAgent: %v", err)
|
||||||
|
}
|
||||||
|
if err := <-serverDone; err != nil {
|
||||||
|
t.Fatalf("server: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHealthVMReturnsFalseForStoppedVM(t *testing.T) {
|
func TestHealthVMReturnsFalseForStoppedVM(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -87,7 +87,7 @@ func waitReady(ctx context.Context, logger *slog.Logger, socketPath string, time
|
||||||
lastErr = fmt.Errorf("guest port %d is not listening yet", Port)
|
lastErr = fmt.Errorf("guest port %d is not listening yet", Port)
|
||||||
} else {
|
} else {
|
||||||
if report != nil {
|
if report != nil {
|
||||||
report("wait_vsock_agent", "waiting for guest vsock agent")
|
report("wait_guest_ready", "waiting for guest services")
|
||||||
}
|
}
|
||||||
lastErr = err
|
lastErr = err
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -57,6 +57,7 @@ func TestWaitReadyReturnsWhenPortIsListening(t *testing.T) {
|
||||||
socketPath := filepath.Join(t.TempDir(), "opencode.vsock")
|
socketPath := filepath.Join(t.TempDir(), "opencode.vsock")
|
||||||
listener, err := net.Listen("unix", socketPath)
|
listener, err := net.Listen("unix", socketPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
skipIfSocketRestricted(t, err)
|
||||||
t.Fatalf("listen: %v", err)
|
t.Fatalf("listen: %v", err)
|
||||||
}
|
}
|
||||||
t.Cleanup(func() {
|
t.Cleanup(func() {
|
||||||
|
|
@ -114,3 +115,37 @@ func TestWaitReadyReturnsWhenPortIsListening(t *testing.T) {
|
||||||
t.Fatalf("server: %v", err)
|
t.Fatalf("server: %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestWaitReadyReportsGuestServicesWhenPortsUnavailable(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var reports []string
|
||||||
|
err := waitReady(
|
||||||
|
context.Background(),
|
||||||
|
nil,
|
||||||
|
filepath.Join(t.TempDir(), "missing.vsock"),
|
||||||
|
50*time.Millisecond,
|
||||||
|
func(stage, detail string) {
|
||||||
|
reports = append(reports, stage+":"+detail)
|
||||||
|
},
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("waitReady() error = nil, want timeout")
|
||||||
|
}
|
||||||
|
if len(reports) == 0 {
|
||||||
|
t.Fatal("waitReady() did not report progress")
|
||||||
|
}
|
||||||
|
if got := reports[0]; got != "wait_guest_ready:waiting for guest services" {
|
||||||
|
t.Fatalf("first report = %q, want guest services wait", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func skipIfSocketRestricted(t *testing.T, err error) {
|
||||||
|
t.Helper()
|
||||||
|
if err == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if strings.Contains(strings.ToLower(err.Error()), "operation not permitted") {
|
||||||
|
t.Skipf("socket creation is restricted in this environment: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -225,7 +225,7 @@ description="Banger guest network bootstrap"
|
||||||
|
|
||||||
depend() {
|
depend() {
|
||||||
need localmount
|
need localmount
|
||||||
before sshd docker banger-vsock-agent banger-opencode
|
before sshd docker banger-opencode
|
||||||
provide net
|
provide net
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -265,7 +265,7 @@ command="/usr/local/bin/banger-vsock-agent"
|
||||||
|
|
||||||
depend() {
|
depend() {
|
||||||
need localmount
|
need localmount
|
||||||
after banger-network
|
before banger-network sshd docker banger-opencode
|
||||||
}
|
}
|
||||||
|
|
||||||
start_pre() {
|
start_pre() {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue