Fix VM lifecycle issues behind verify.sh

Make the Firecracker and bangerd processes outlive short-lived CLI request contexts so vm create no longer kills the VMM or daemon as soon as the RPC returns.

Fix fresh-VM SSH by flattening the seeded /root work disk when the copied home tree lands under a nested root/ directory, and write a guest sshd override to keep root pubkey auth explicit while debugging.

Harden teardown and smoke diagnostics: verify.sh now reports early Firecracker exit and delete failures directly, while dm snapshot cleanup tolerates already-gone handles and retries busy mapper removal long enough for Firecracker to release the device.

Validation: go test ./..., make build, bash -n verify.sh, direct SSH against a fresh VM, and a live ./verify.sh run that now completes with [verify] ok.
This commit is contained in:
Thales Maciel 2026-03-17 14:43:09 -03:00
parent 617f677c9b
commit 60294e8c90
No known key found for this signature in database
GPG key ID: 33112E6833C34679
7 changed files with 149 additions and 21 deletions

View file

@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"strings"
"time"
)
type dmSnapshotHandles struct {
@ -55,25 +56,56 @@ func (d *Daemon) cleanupDMSnapshot(ctx context.Context, handles dmSnapshotHandle
switch {
case handles.DMName != "":
if _, err := d.runner.RunSudo(ctx, "dmsetup", "remove", handles.DMName); err != nil {
if err := d.removeDMSnapshot(ctx, handles.DMName); err != nil {
cleanupErr = errors.Join(cleanupErr, err)
}
case handles.DMDev != "":
if _, err := d.runner.RunSudo(ctx, "dmsetup", "remove", handles.DMDev); err != nil {
if err := d.removeDMSnapshot(ctx, handles.DMDev); err != nil {
cleanupErr = errors.Join(cleanupErr, err)
}
}
if handles.COWLoop != "" {
if _, err := d.runner.RunSudo(ctx, "losetup", "-d", handles.COWLoop); err != nil {
cleanupErr = errors.Join(cleanupErr, err)
if !isMissingSnapshotHandle(err) {
cleanupErr = errors.Join(cleanupErr, err)
}
}
}
if handles.BaseLoop != "" {
if _, err := d.runner.RunSudo(ctx, "losetup", "-d", handles.BaseLoop); err != nil {
cleanupErr = errors.Join(cleanupErr, err)
if !isMissingSnapshotHandle(err) {
cleanupErr = errors.Join(cleanupErr, err)
}
}
}
return cleanupErr
}
func (d *Daemon) removeDMSnapshot(ctx context.Context, target string) error {
deadline := time.Now().Add(3 * time.Second)
for {
if _, err := d.runner.RunSudo(ctx, "dmsetup", "remove", target); err != nil {
if isMissingSnapshotHandle(err) {
return nil
}
if strings.Contains(err.Error(), "Device or resource busy") && time.Now().Before(deadline) {
time.Sleep(100 * time.Millisecond)
continue
}
return err
}
return nil
}
}
func isMissingSnapshotHandle(err error) bool {
if err == nil {
return false
}
msg := err.Error()
return strings.Contains(msg, "No such device or address") ||
strings.Contains(msg, "not found") ||
strings.Contains(msg, "does not exist")
}