Roll back partial dm snapshot startup

Prevent partial VM startup failures from leaking loop devices and dm state on the host.

Move root snapshot setup into a rollback-safe helper that records loop and mapper handles incrementally, tears them down in reverse order on failure, and reuses the same dm/loop cleanup path during normal runtime teardown. Also switch the daemon runner field to a small command-runner interface so the snapshot path can be tested with injected failures.

Add failure-injection coverage for losetup, blockdev, dmsetup, partial teardown, and joined rollback errors. Validated with go test ./... and make build.
This commit is contained in:
Thales Maciel 2026-03-16 14:06:17 -03:00
parent 171009b30b
commit 375900cf65
No known key found for this signature in database
GPG key ID: 33112E6833C34679
5 changed files with 401 additions and 47 deletions

View file

@ -153,14 +153,14 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
return model.VMRecord{}, err
}
baseLoop, cowLoop, dmDev, err := d.createDMSnapshot(ctx, image.RootfsPath, vm.Runtime.SystemOverlay, dmName)
handles, err := d.createDMSnapshot(ctx, image.RootfsPath, vm.Runtime.SystemOverlay, dmName)
if err != nil {
return model.VMRecord{}, err
}
vm.Runtime.BaseLoop = baseLoop
vm.Runtime.COWLoop = cowLoop
vm.Runtime.DMName = dmName
vm.Runtime.DMDev = dmDev
vm.Runtime.BaseLoop = handles.BaseLoop
vm.Runtime.COWLoop = handles.COWLoop
vm.Runtime.DMName = handles.DMName
vm.Runtime.DMDev = handles.DMDev
vm.Runtime.APISockPath = apiSock
vm.Runtime.TapDevice = tap
vm.Runtime.State = model.VMStateRunning
@ -171,7 +171,9 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
vm.State = model.VMStateError
vm.Runtime.State = model.VMStateError
vm.Runtime.LastError = err.Error()
_ = d.cleanupRuntime(context.Background(), vm, true)
if cleanupErr := d.cleanupRuntime(context.Background(), vm, true); cleanupErr != nil {
err = errors.Join(err, cleanupErr)
}
clearRuntimeHandles(&vm)
_ = d.store.UpsertVM(context.Background(), vm)
return model.VMRecord{}, err
@ -273,6 +275,7 @@ func (d *Daemon) StopVM(ctx context.Context, idOrName string) (model.VMRecord, e
return vm, nil
}
func (d *Daemon) RestartVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
vm, err := d.StopVM(ctx, idOrName)
if err != nil {
@ -506,28 +509,6 @@ func (d *Daemon) ensureWorkDisk(ctx context.Context, vm *model.VMRecord) error {
return nil
}
// createDMSnapshot attaches loop devices for rootfsPath and cowPath and then
// creates a device-mapper snapshot target named dmName on top of them.
//
// On success it returns the base loop device, the COW loop device and the
// mapper path ("/dev/mapper/" + dmName). If any command fails it returns three
// empty strings together with that command's error.
//
// NOTE(review): none of the error returns below detach a loop device that an
// earlier step in this function already attached, and the caller receives empty
// strings for them, so a failure after the first losetup leaves those devices
// attached with no handle returned.
func (d *Daemon) createDMSnapshot(ctx context.Context, rootfsPath, cowPath, dmName string) (baseLoop, cowLoop, dmDev string, err error) {
// Attach the root filesystem image read-only; "-f --show" picks a free loop
// device and prints its path, which is captured from the command output.
baseBytes, err := d.runner.RunSudo(ctx, "losetup", "-f", "--show", "--read-only", rootfsPath)
if err != nil {
return "", "", "", err
}
baseLoop = strings.TrimSpace(string(baseBytes))
// Attach the COW file on a second loop device (no --read-only flag here).
cowBytes, err := d.runner.RunSudo(ctx, "losetup", "-f", "--show", cowPath)
if err != nil {
// baseLoop is still attached at this point and is not returned.
return "", "", "", err
}
cowLoop = strings.TrimSpace(string(cowBytes))
// Query the size of the base loop device; per blockdev(8), --getsz reports it
// in 512-byte sectors. The value is used as the length of the dm table below.
sectorsBytes, err := d.runner.RunSudo(ctx, "blockdev", "--getsz", baseLoop)
if err != nil {
// Both loop devices are still attached at this point and are not returned.
return "", "", "", err
}
sectors := strings.TrimSpace(string(sectorsBytes))
// Table line: start 0, length <sectors>, target "snapshot" with baseLoop and
// cowLoop as its two devices. Per the kernel dm-snapshot documentation the
// trailing "P 8" selects a persistent snapshot with a chunk size of 8 sectors.
if _, err := d.runner.RunSudo(ctx, "dmsetup", "create", dmName, "--table", fmt.Sprintf("0 %s snapshot %s %s P 8", sectors, baseLoop, cowLoop)); err != nil {
// Both loop devices are still attached at this point and are not returned.
return "", "", "", err
}
return baseLoop, cowLoop, "/dev/mapper/" + dmName, nil
}
func (d *Daemon) ensureBridge(ctx context.Context) error {
if _, err := d.runner.Run(ctx, "ip", "link", "show", d.config.BridgeName); err == nil {
_, err = d.runner.RunSudo(ctx, "ip", "link", "set", d.config.BridgeName, "up")
@ -638,25 +619,20 @@ func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserve
if vm.Runtime.APISockPath != "" {
_ = os.Remove(vm.Runtime.APISockPath)
}
if vm.Runtime.DMName != "" {
_, _ = d.runner.RunSudo(ctx, "dmsetup", "remove", vm.Runtime.DMName)
} else if vm.Runtime.DMDev != "" {
_, _ = d.runner.RunSudo(ctx, "dmsetup", "remove", vm.Runtime.DMDev)
}
if vm.Runtime.COWLoop != "" {
_, _ = d.runner.RunSudo(ctx, "losetup", "-d", vm.Runtime.COWLoop)
}
if vm.Runtime.BaseLoop != "" {
_, _ = d.runner.RunSudo(ctx, "losetup", "-d", vm.Runtime.BaseLoop)
}
snapshotErr := d.cleanupDMSnapshot(ctx, dmSnapshotHandles{
BaseLoop: vm.Runtime.BaseLoop,
COWLoop: vm.Runtime.COWLoop,
DMName: vm.Runtime.DMName,
DMDev: vm.Runtime.DMDev,
})
if vm.Spec.NATEnabled {
_ = d.ensureNAT(ctx, vm, false)
}
_ = d.removeDNS(ctx, vm.Runtime.DNSName)
if !preserveDisks && vm.Runtime.VMDir != "" {
return os.RemoveAll(vm.Runtime.VMDir)
return errors.Join(snapshotErr, os.RemoveAll(vm.Runtime.VMDir))
}
return nil
return snapshotErr
}
func clearRuntimeHandles(vm *model.VMRecord) {