Fix VM lifecycle issues behind verify.sh
Make the Firecracker and bangerd processes outlive short-lived CLI request contexts so vm create no longer kills the VMM or daemon as soon as the RPC returns. Fix fresh-VM SSH by flattening the seeded /root work disk when the copied home tree lands under a nested root/ directory, and write a guest sshd override to keep root pubkey auth explicit while debugging. Harden teardown and smoke diagnostics: verify.sh now reports early Firecracker exit and delete failures directly, while dm snapshot cleanup tolerates already-gone handles and retries busy mapper removal long enough for Firecracker to release the device. Validation: go test ./..., make build, bash -n verify.sh, direct SSH against a fresh VM, and a live ./verify.sh run that now completes with [verify] ok.
This commit is contained in:
parent
617f677c9b
commit
60294e8c90
7 changed files with 149 additions and 21 deletions
|
|
@ -238,7 +238,8 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
|
|||
return cleanupOnErr(err)
|
||||
}
|
||||
op.stage("firecracker_launch", "log_path", vm.Runtime.LogPath, "metrics_path", vm.Runtime.MetricsPath)
|
||||
machine, err := firecracker.NewMachine(ctx, firecracker.MachineConfig{
|
||||
firecrackerCtx := context.Background()
|
||||
machine, err := firecracker.NewMachine(firecrackerCtx, firecracker.MachineConfig{
|
||||
BinaryPath: fcPath,
|
||||
VMID: vm.ID,
|
||||
SocketPath: apiSock,
|
||||
|
|
@ -257,11 +258,11 @@ func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image mod
|
|||
if err != nil {
|
||||
return cleanupOnErr(err)
|
||||
}
|
||||
if err := machine.Start(ctx); err != nil {
|
||||
vm.Runtime.PID = d.resolveFirecrackerPID(ctx, machine, apiSock)
|
||||
if err := machine.Start(firecrackerCtx); err != nil {
|
||||
vm.Runtime.PID = d.resolveFirecrackerPID(firecrackerCtx, machine, apiSock)
|
||||
return cleanupOnErr(err)
|
||||
}
|
||||
vm.Runtime.PID = d.resolveFirecrackerPID(ctx, machine, apiSock)
|
||||
vm.Runtime.PID = d.resolveFirecrackerPID(firecrackerCtx, machine, apiSock)
|
||||
op.debugStage("firecracker_started", "pid", vm.Runtime.PID)
|
||||
op.stage("socket_access", "api_socket", apiSock)
|
||||
if err := d.ensureSocketAccess(ctx, apiSock); err != nil {
|
||||
|
|
@ -640,16 +641,25 @@ func (d *Daemon) patchRootOverlay(ctx context.Context, vm model.VMRecord, image
|
|||
resolv := []byte(fmt.Sprintf("nameserver %s\n", d.config.DefaultDNS))
|
||||
hostname := []byte(vm.Name + "\n")
|
||||
hosts := []byte(fmt.Sprintf("127.0.0.1 localhost\n127.0.1.1 %s\n", vm.Name))
|
||||
sshdConfig := []byte(strings.Join([]string{
|
||||
"LogLevel DEBUG3",
|
||||
"PermitRootLogin yes",
|
||||
"PubkeyAuthentication yes",
|
||||
"AuthorizedKeysFile /root/.ssh/authorized_keys",
|
||||
"StrictModes no",
|
||||
"",
|
||||
}, "\n"))
|
||||
fstab, err := system.ReadDebugFSText(ctx, d.runner, vm.Runtime.DMDev, "/etc/fstab")
|
||||
if err != nil {
|
||||
fstab = ""
|
||||
}
|
||||
newFSTab := system.UpdateFSTab(fstab)
|
||||
for guestPath, data := range map[string][]byte{
|
||||
"/etc/resolv.conf": resolv,
|
||||
"/etc/hostname": hostname,
|
||||
"/etc/hosts": hosts,
|
||||
"/etc/fstab": []byte(newFSTab),
|
||||
"/etc/resolv.conf": resolv,
|
||||
"/etc/hostname": hostname,
|
||||
"/etc/hosts": hosts,
|
||||
"/etc/fstab": []byte(newFSTab),
|
||||
"/etc/ssh/sshd_config.d/99-banger.conf": sshdConfig,
|
||||
} {
|
||||
if err := system.WriteExt4File(ctx, d.runner, vm.Runtime.DMDev, guestPath, data); err != nil {
|
||||
return err
|
||||
|
|
@ -681,9 +691,31 @@ func (d *Daemon) ensureWorkDisk(ctx context.Context, vm *model.VMRecord) error {
|
|||
if err := system.CopyDirContents(ctx, d.runner, filepath.Join(rootMount, "root"), workMount, true); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := d.flattenNestedWorkHome(ctx, workMount); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (d *Daemon) flattenNestedWorkHome(ctx context.Context, workMount string) error {
|
||||
nestedHome := filepath.Join(workMount, "root")
|
||||
if !exists(nestedHome) {
|
||||
return nil
|
||||
}
|
||||
script := `set -e
|
||||
src="$1"
|
||||
dst="$2"
|
||||
for path in "$src"/.[!.]* "$src"/..?* "$src"/*; do
|
||||
[ -e "$path" ] || continue
|
||||
cp -a "$path" "$dst"/
|
||||
done`
|
||||
if _, err := d.runner.RunSudo(ctx, "sh", "-c", script, "sh", nestedHome, workMount); err != nil {
|
||||
return err
|
||||
}
|
||||
_, err := d.runner.RunSudo(ctx, "rm", "-rf", nestedHome)
|
||||
return err
|
||||
}
|
||||
|
||||
func (d *Daemon) ensureBridge(ctx context.Context) error {
|
||||
if _, err := d.runner.Run(ctx, "ip", "link", "show", d.config.BridgeName); err == nil {
|
||||
_, err = d.runner.RunSudo(ctx, "ip", "link", "set", d.config.BridgeName, "up")
|
||||
|
|
@ -790,6 +822,9 @@ func (d *Daemon) cleanupRuntime(ctx context.Context, vm model.VMRecord, preserve
|
|||
}
|
||||
if vm.Runtime.PID > 0 && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
|
||||
_ = d.killVMProcess(ctx, vm.Runtime.PID)
|
||||
if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if vm.Runtime.TapDevice != "" {
|
||||
_, _ = d.runner.RunSudo(ctx, "ip", "link", "del", vm.Runtime.TapDevice)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue