Stop and delete could fail with device-mapper busy errors when the persisted Firecracker PID was stale or the kernel needed longer to release the root snapshot. Rediscover a live Firecracker process by API socket during cleanup, kill and wait on that PID instead of trusting only the stored runtime PID, and extend dm snapshot removal retries for transient busy handles. Add daemon regressions for stale-runtime reconcile, rediscovered process cleanup, and repeated busy dm removal. Validate with go test ./..., make build, and a live ./banger vm stop debug-ssh run that now exits cleanly.
111 lines
2.8 KiB
Go
111 lines
2.8 KiB
Go
package daemon
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// dmSnapshotHandles records the host resources that back one device-mapper
// snapshot so they can be torn down later. A field left empty means the
// corresponding resource was never set up and is skipped during cleanup.
type dmSnapshotHandles struct {
	// BaseLoop is the loop device attached read-only to the rootfs image.
	BaseLoop string
	// COWLoop is the loop device attached to the copy-on-write file.
	COWLoop string
	// DMName is the name the snapshot target was created under with dmsetup.
	DMName string
	// DMDev is the snapshot's device node path, "/dev/mapper/" + DMName.
	DMDev string
}
|
|
|
|
func (d *Daemon) createDMSnapshot(ctx context.Context, rootfsPath, cowPath, dmName string) (handles dmSnapshotHandles, err error) {
|
|
defer func() {
|
|
if err == nil {
|
|
return
|
|
}
|
|
if cleanupErr := d.cleanupDMSnapshot(context.Background(), handles); cleanupErr != nil {
|
|
err = errors.Join(err, cleanupErr)
|
|
}
|
|
}()
|
|
|
|
baseBytes, err := d.runner.RunSudo(ctx, "losetup", "-f", "--show", "--read-only", rootfsPath)
|
|
if err != nil {
|
|
return handles, err
|
|
}
|
|
handles.BaseLoop = strings.TrimSpace(string(baseBytes))
|
|
|
|
cowBytes, err := d.runner.RunSudo(ctx, "losetup", "-f", "--show", cowPath)
|
|
if err != nil {
|
|
return handles, err
|
|
}
|
|
handles.COWLoop = strings.TrimSpace(string(cowBytes))
|
|
|
|
sectorsBytes, err := d.runner.RunSudo(ctx, "blockdev", "--getsz", handles.BaseLoop)
|
|
if err != nil {
|
|
return handles, err
|
|
}
|
|
sectors := strings.TrimSpace(string(sectorsBytes))
|
|
|
|
if _, err := d.runner.RunSudo(ctx, "dmsetup", "create", dmName, "--table", fmt.Sprintf("0 %s snapshot %s %s P 8", sectors, handles.BaseLoop, handles.COWLoop)); err != nil {
|
|
return handles, err
|
|
}
|
|
handles.DMName = dmName
|
|
handles.DMDev = "/dev/mapper/" + dmName
|
|
return handles, nil
|
|
}
|
|
|
|
func (d *Daemon) cleanupDMSnapshot(ctx context.Context, handles dmSnapshotHandles) error {
|
|
var cleanupErr error
|
|
|
|
switch {
|
|
case handles.DMName != "":
|
|
if err := d.removeDMSnapshot(ctx, handles.DMName); err != nil {
|
|
cleanupErr = errors.Join(cleanupErr, err)
|
|
}
|
|
case handles.DMDev != "":
|
|
if err := d.removeDMSnapshot(ctx, handles.DMDev); err != nil {
|
|
cleanupErr = errors.Join(cleanupErr, err)
|
|
}
|
|
}
|
|
|
|
if handles.COWLoop != "" {
|
|
if _, err := d.runner.RunSudo(ctx, "losetup", "-d", handles.COWLoop); err != nil {
|
|
if !isMissingSnapshotHandle(err) {
|
|
cleanupErr = errors.Join(cleanupErr, err)
|
|
}
|
|
}
|
|
}
|
|
if handles.BaseLoop != "" {
|
|
if _, err := d.runner.RunSudo(ctx, "losetup", "-d", handles.BaseLoop); err != nil {
|
|
if !isMissingSnapshotHandle(err) {
|
|
cleanupErr = errors.Join(cleanupErr, err)
|
|
}
|
|
}
|
|
}
|
|
|
|
return cleanupErr
|
|
}
|
|
|
|
func (d *Daemon) removeDMSnapshot(ctx context.Context, target string) error {
|
|
deadline := time.Now().Add(15 * time.Second)
|
|
for {
|
|
if _, err := d.runner.RunSudo(ctx, "dmsetup", "remove", target); err != nil {
|
|
if isMissingSnapshotHandle(err) {
|
|
return nil
|
|
}
|
|
if strings.Contains(err.Error(), "Device or resource busy") && time.Now().Before(deadline) {
|
|
time.Sleep(100 * time.Millisecond)
|
|
continue
|
|
}
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// isMissingSnapshotHandle reports whether err's message says the targeted
// device or mapping is absent. It matches on the substrings
// "No such device or address", "not found" and "does not exist"; a nil error
// is never considered missing.
func isMissingSnapshotHandle(err error) bool {
	if err == nil {
		return false
	}

	text := err.Error()
	for _, marker := range [...]string{
		"No such device or address",
		"not found",
		"does not exist",
	} {
		if strings.Contains(text, marker) {
			return true
		}
	}
	return false
}
|