banger/internal/daemon/snapshot.go
Thales Maciel 60294e8c90
Fix VM lifecycle issues behind verify.sh
Make the Firecracker and bangerd processes outlive short-lived CLI request contexts so vm create no longer kills the VMM or daemon as soon as the RPC returns.

Fix fresh-VM SSH by flattening the seeded /root work disk when the copied home tree lands under a nested root/ directory, and write a guest sshd override to keep root pubkey auth explicit while debugging.

Harden teardown and smoke diagnostics: verify.sh now reports early Firecracker exit and delete failures directly, while dm snapshot cleanup tolerates already-gone handles and retries busy mapper removal long enough for Firecracker to release the device.

Validation: go test ./..., make build, bash -n verify.sh, direct SSH against a fresh VM, and a live ./verify.sh run that now completes with [verify] ok.
2026-03-17 14:43:09 -03:00

111 lines
2.8 KiB
Go

package daemon
import (
"context"
"errors"
"fmt"
"strings"
"time"
)
// dmSnapshotHandles records the host resources that back one device-mapper
// snapshot so they can be released later by cleanupDMSnapshot. A field left
// empty means the corresponding resource was never set up.
type dmSnapshotHandles struct {
	// Loop devices as printed by `losetup -f --show`: BaseLoop is the
	// read-only attachment of the root filesystem image, COWLoop is the
	// attachment of the copy-on-write file.
	BaseLoop, COWLoop string

	// DMName is the name handed to `dmsetup create`; DMDev is the matching
	// "/dev/mapper/<name>" device path.
	DMName, DMDev string
}
// createDMSnapshot builds a device-mapper snapshot named dmName on top of two
// freshly attached loop devices: a read-only one for rootfsPath (the origin)
// and a writable one for cowPath (the copy-on-write store).
//
// On success the returned handles describe both loop devices and the mapper
// device ("/dev/mapper/<dmName>").
//
// On failure the returned error says which step failed and wraps the runner's
// error with %w. Whatever had been set up so far is torn down via
// cleanupDMSnapshot; a teardown failure is joined onto the returned error.
// The handles returned alongside an error still name the devices that were
// attached before the failure.
func (d *Daemon) createDMSnapshot(ctx context.Context, rootfsPath, cowPath, dmName string) (handles dmSnapshotHandles, err error) {
	defer func() {
		if err == nil {
			return
		}
		// Roll back with a fresh context so a cancelled ctx (possibly the
		// reason we are failing) does not also abort the teardown.
		if cleanupErr := d.cleanupDMSnapshot(context.Background(), handles); cleanupErr != nil {
			err = errors.Join(err, cleanupErr)
		}
	}()

	out, err := d.runner.RunSudo(ctx, "losetup", "-f", "--show", "--read-only", rootfsPath)
	if err != nil {
		return handles, fmt.Errorf("attach read-only loop device for %q: %w", rootfsPath, err)
	}
	handles.BaseLoop = strings.TrimSpace(string(out))

	out, err = d.runner.RunSudo(ctx, "losetup", "-f", "--show", cowPath)
	if err != nil {
		return handles, fmt.Errorf("attach COW loop device for %q: %w", cowPath, err)
	}
	handles.COWLoop = strings.TrimSpace(string(out))

	// `blockdev --getsz` reports the size in 512-byte sectors, which is the
	// unit the device-mapper table expects for the segment length.
	out, err = d.runner.RunSudo(ctx, "blockdev", "--getsz", handles.BaseLoop)
	if err != nil {
		return handles, fmt.Errorf("read size of %s: %w", handles.BaseLoop, err)
	}
	sectors := strings.TrimSpace(string(out))

	// Table: "<start> <length> snapshot <origin> <cow> <P|N> <chunk sectors>".
	// "P" asks for a persistent snapshot with a chunk size of 8 sectors.
	table := fmt.Sprintf("0 %s snapshot %s %s P 8", sectors, handles.BaseLoop, handles.COWLoop)
	if _, err := d.runner.RunSudo(ctx, "dmsetup", "create", dmName, "--table", table); err != nil {
		return handles, fmt.Errorf("create dm snapshot %q: %w", dmName, err)
	}

	// Record the mapper device only once it exists, so the failure path
	// above never tries to remove a device that was not created.
	handles.DMName = dmName
	handles.DMDev = "/dev/mapper/" + dmName
	return handles, nil
}
// cleanupDMSnapshot releases the resources named in handles: first the
// device-mapper device, then the COW loop device, then the base loop device.
// Empty fields are skipped.
//
// Every step is attempted even if an earlier one failed. The result is nil
// when nothing went wrong, otherwise the joined errors of the steps that
// failed. A loop device that is already gone (per isMissingSnapshotHandle)
// is not counted as a failure.
func (d *Daemon) cleanupDMSnapshot(ctx context.Context, handles dmSnapshotHandles) error {
	var failures error

	// Address the mapper device by name when we have one; the device path is
	// only a fallback, and at most one removal is attempted.
	mapperTarget := handles.DMName
	if mapperTarget == "" {
		mapperTarget = handles.DMDev
	}
	if mapperTarget != "" {
		if rmErr := d.removeDMSnapshot(ctx, mapperTarget); rmErr != nil {
			failures = errors.Join(failures, rmErr)
		}
	}

	// Detach the loop devices, COW first and base second.
	for _, loopDev := range [...]string{handles.COWLoop, handles.BaseLoop} {
		if loopDev == "" {
			continue
		}
		_, detachErr := d.runner.RunSudo(ctx, "losetup", "-d", loopDev)
		if detachErr == nil || isMissingSnapshotHandle(detachErr) {
			continue
		}
		failures = errors.Join(failures, detachErr)
	}

	return failures
}
// removeDMSnapshot runs `dmsetup remove` on target, which may be a mapper
// name or a device path.
//
// It returns nil when the removal succeeds or when the device is already gone
// (per isMissingSnapshotHandle). A "Device or resource busy" failure is
// retried every 100ms for up to 3 seconds, giving a process that still holds
// the device (e.g. an exiting Firecracker) time to release it. Any other
// error, or a busy error once the deadline has passed, is returned as is.
// If ctx is done while waiting between attempts, the wait ends immediately
// and the last busy error is returned joined with ctx.Err().
func (d *Daemon) removeDMSnapshot(ctx context.Context, target string) error {
	const (
		busyTimeout   = 3 * time.Second
		retryInterval = 100 * time.Millisecond
	)

	deadline := time.Now().Add(busyTimeout)
	for {
		_, err := d.runner.RunSudo(ctx, "dmsetup", "remove", target)
		if err == nil || isMissingSnapshotHandle(err) {
			return nil
		}
		busy := strings.Contains(err.Error(), "Device or resource busy")
		if !busy || !time.Now().Before(deadline) {
			return err
		}

		// Wait before the next attempt, but give up right away if the caller
		// cancels instead of sleeping through the cancellation.
		timer := time.NewTimer(retryInterval)
		select {
		case <-ctx.Done():
			timer.Stop()
			return errors.Join(err, ctx.Err())
		case <-timer.C:
		}
	}
}
// isMissingSnapshotHandle reports whether err's message indicates that the
// device being removed or detached is already gone. The check is a
// case-sensitive substring match on err.Error(); a nil err yields false.
//
// NOTE(review): "not found" is a broad marker and would also match messages
// such as a missing executable — confirm that is acceptable for callers.
func isMissingSnapshotHandle(err error) bool {
	if err == nil {
		return false
	}
	text := err.Error()
	for _, marker := range [...]string{
		"No such device or address",
		"not found",
		"does not exist",
	} {
		if strings.Contains(text, marker) {
			return true
		}
	}
	return false
}