package daemon

import (
	"context"
	"errors"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"banger/internal/api"
	"banger/internal/model"
	"banger/internal/system"
)

// StartVM boots the VM identified by idOrName. Starting a VM that is
// already running is a no-op and returns the current record.
func (s *VMService) StartVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		image, err := s.store.GetImageByID(ctx, vm.ImageID)
		if err != nil {
			return model.VMRecord{}, err
		}
		if s.vmAlive(vm) {
			if s.logger != nil {
				s.logger.Info("vm already running", vmLogAttrs(vm)...)
			}
			return vm, nil
		}
		return s.startVMLocked(ctx, vm, image)
	})
}

func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image model.Image) (_ model.VMRecord, err error) {
	op := s.beginOperation("vm.start", append(vmLogAttrs(vm), imageLogAttrs(image)...)...)
	defer func() {
		if err != nil {
			err = annotateLogPath(err, vm.Runtime.LogPath)
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()

	// Derive per-VM paths/names up front so every step sees the same
	// values. Shortening vm.ID mirrors how the pre-refactor inline
	// code did it.
	shortID := system.ShortID(vm.ID)
	apiSock := filepath.Join(s.layout.RuntimeDir, "fc-"+shortID+".sock")
	dmName := "fc-rootfs-" + shortID
	tapName := "tap-fc-" + shortID

	if strings.TrimSpace(vm.Runtime.VSockPath) == "" {
		vm.Runtime.VSockPath = defaultVSockPath(s.layout.RuntimeDir, vm.ID)
	}
	if vm.Runtime.VSockCID == 0 {
		vm.Runtime.VSockCID, err = defaultVSockCID(vm.Runtime.GuestIP)
		if err != nil {
			return model.VMRecord{}, err
		}
	}

	live := model.VMHandles{}
	sc := &startContext{
		vm:      &vm,
		image:   image,
		live:    &live,
		apiSock: apiSock,
		dmName:  dmName,
		tapName: tapName,
	}
	if runErr := s.runStartSteps(ctx, op, sc, s.buildStartSteps(op, sc)); runErr != nil {
		// The step driver already ran rollback in reverse for every
		// succeeded step. All that's left is to persist the ERROR
		// state so operators see the failure via `vm show`. Use a
		// fresh context in case the request ctx is cancelled; DB
		// writes past this point are recovery, not user-driven.
		//
		// The store check is for tests that construct a bare Daemon
		// without a DB; production always has s.store non-nil.
		vm.State = model.VMStateError
		vm.Runtime.State = model.VMStateError
		vm.Runtime.LastError = runErr.Error()
		clearRuntimeTeardownState(&vm)
		s.clearVMHandles(vm)
		if s.store != nil {
			_ = s.store.UpsertVM(context.Background(), vm)
		}
		return model.VMRecord{}, runErr
	}
	return vm, nil
}
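// The step driver above owns rollback: when a step fails, runStartSteps
// unwinds every step that already succeeded, in reverse order, before
// returning, and startVMLocked only persists the ERROR state afterwards.
// A minimal sketch of the contract this implies (illustrative only; the
// names below are hypothetical, not this package's actual types):
//
//	type startStep struct {
//		name     string
//		run      func(ctx context.Context) error
//		rollback func(ctx context.Context) // best-effort undo of run
//	}
//
// buildStartSteps would assemble the ordered slice from the
// startContext, and the driver would record each step that completes so
// it knows exactly which rollbacks to run on failure.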
// StopVM requests a graceful guest shutdown and then tears down the
// VM's host-side runtime. A VM that is no longer alive still gets its
// stale runtime cleaned up and its state reconciled to STOPPED.
func (s *VMService) StopVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		return s.stopVMLocked(ctx, vm)
	})
}

func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
	vm = current
	op := s.beginOperation("vm.stop", "vm_ref", vm.ID)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()

	if !s.vmAlive(vm) {
		// Process is already gone; reconcile state and clean up leftovers.
		op.stage("cleanup_stale_runtime")
		if err := s.cleanupRuntime(ctx, vm, true); err != nil {
			return model.VMRecord{}, err
		}
		vm.State = model.VMStateStopped
		vm.Runtime.State = model.VMStateStopped
		clearRuntimeTeardownState(&vm)
		s.clearVMHandles(vm)
		if err := s.store.UpsertVM(ctx, vm); err != nil {
			return model.VMRecord{}, err
		}
		return vm, nil
	}

	pid := s.vmHandles(vm.ID).PID
	op.stage("graceful_shutdown")
	if err := s.net.sendCtrlAltDel(ctx, vm.Runtime.APISockPath); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("wait_for_exit", "pid", pid)
	if err := s.net.waitForExit(ctx, pid, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil {
		if !errors.Is(err, errWaitForExitTimeout) {
			return model.VMRecord{}, err
		}
		// Timed out waiting for a clean exit; proceed to runtime
		// cleanup anyway.
		op.stage("graceful_shutdown_timeout", "pid", pid)
	}
	op.stage("cleanup_runtime")
	if err := s.cleanupRuntime(ctx, vm, true); err != nil {
		return model.VMRecord{}, err
	}
	vm.State = model.VMStateStopped
	vm.Runtime.State = model.VMStateStopped
	clearRuntimeTeardownState(&vm)
	s.clearVMHandles(vm)
	system.TouchNow(&vm)
	if err := s.store.UpsertVM(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	return vm, nil
}

// KillVM sends the given signal (TERM when empty) to the VM process
// and then tears down its runtime.
func (s *VMService) KillVM(ctx context.Context, params api.VMKillParams) (model.VMRecord, error) {
	return s.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		return s.killVMLocked(ctx, vm, params.Signal)
	})
}

func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, signalValue string) (vm model.VMRecord, err error) {
	vm = current
	op := s.beginOperation("vm.kill", "vm_ref", vm.ID, "signal", signalValue)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()

	if !s.vmAlive(vm) {
		op.stage("cleanup_stale_runtime")
		if err := s.cleanupRuntime(ctx, vm, true); err != nil {
			return model.VMRecord{}, err
		}
		vm.State = model.VMStateStopped
		vm.Runtime.State = model.VMStateStopped
		clearRuntimeTeardownState(&vm)
		s.clearVMHandles(vm)
		if err := s.store.UpsertVM(ctx, vm); err != nil {
			return model.VMRecord{}, err
		}
		return vm, nil
	}

	signal := strings.TrimSpace(signalValue)
	if signal == "" {
		signal = "TERM"
	}
	pid := s.vmHandles(vm.ID).PID
	op.stage("send_signal", "pid", pid, "signal", signal)
	if _, err := s.runner.RunSudo(ctx, "kill", "-"+signal, strconv.Itoa(pid)); err != nil {
		return model.VMRecord{}, err
	}
	// Kill waits a fixed 30s for the process to exit; stop uses
	// gracefulShutdownWait instead.
	op.stage("wait_for_exit", "pid", pid)
	if err := s.net.waitForExit(ctx, pid, vm.Runtime.APISockPath, 30*time.Second); err != nil {
		if !errors.Is(err, errWaitForExitTimeout) {
			return model.VMRecord{}, err
		}
		op.stage("signal_timeout", "pid", pid, "signal", signal)
	}
	op.stage("cleanup_runtime")
	if err := s.cleanupRuntime(ctx, vm, true); err != nil {
		return model.VMRecord{}, err
	}
	vm.State = model.VMStateStopped
	vm.Runtime.State = model.VMStateStopped
	clearRuntimeTeardownState(&vm)
	s.clearVMHandles(vm)
	system.TouchNow(&vm)
	if err := s.store.UpsertVM(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	return vm, nil
}
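// KillVM passes the caller's signal name straight through to the kill
// command; an empty Signal falls back to TERM in killVMLocked. A
// hypothetical caller (the service variable, VM name, and signal below
// are invented for illustration):
//
//	rec, err := svc.KillVM(ctx, api.VMKillParams{
//		IDOrName: "build-agent",
//		Signal:   "KILL", // SIGKILL cannot be caught, so no graceful window
//	})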
// RestartVM stops the VM identified by idOrName and starts it again,
// holding the VM lock across both phases.
func (s *VMService) RestartVM(ctx context.Context, idOrName string) (vm model.VMRecord, err error) {
	op := s.beginOperation("vm.restart", "vm_ref", idOrName)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()

	resolved, err := s.FindVM(ctx, idOrName)
	if err != nil {
		return model.VMRecord{}, err
	}
	return s.withVMLockByID(ctx, resolved.ID, func(vm model.VMRecord) (model.VMRecord, error) {
		op.stage("stop")
		// Declare err locally rather than capturing RestartVM's named
		// return, which the outer defer reads.
		vm, err := s.stopVMLocked(ctx, vm)
		if err != nil {
			return model.VMRecord{}, err
		}
		image, err := s.store.GetImageByID(ctx, vm.ImageID)
		if err != nil {
			return model.VMRecord{}, err
		}
		op.stage("start", vmLogAttrs(vm)...)
		return s.startVMLocked(ctx, vm, image)
	})
}

// DeleteVM force-kills the VM if it is still running, then removes its
// runtime resources, store record, and on-disk directory.
func (s *VMService) DeleteVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return s.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		return s.deleteVMLocked(ctx, vm)
	})
}

func (s *VMService) deleteVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
	vm = current
	op := s.beginOperation("vm.delete", "vm_ref", vm.ID)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()

	if s.vmAlive(vm) {
		pid := s.vmHandles(vm.ID).PID
		op.stage("kill_running_vm", "pid", pid)
		_ = s.net.killVMProcess(ctx, pid)
	}
	op.stage("cleanup_runtime")
	if err := s.cleanupRuntime(ctx, vm, false); err != nil {
		return model.VMRecord{}, err
	}
	clearRuntimeTeardownState(&vm)
	op.stage("delete_store_record")
	if err := s.store.DeleteVM(ctx, vm.ID); err != nil {
		return model.VMRecord{}, err
	}
	if vm.Runtime.VMDir != "" {
		op.stage("delete_vm_dir", "vm_dir", vm.Runtime.VMDir)
		if err := os.RemoveAll(vm.Runtime.VMDir); err != nil {
			return model.VMRecord{}, err
		}
	}
	// Drop any host-key pins. A future VM reusing this IP or name
	// would otherwise trip the TOFU mismatch branch in
	// TOFUHostKeyCallback and fail to connect.
	removeVMKnownHosts(s.layout.KnownHostsPath, vm, s.logger)
	return vm, nil
}
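// The pins removed above live in the daemon's known_hosts file at
// s.layout.KnownHostsPath. For orientation, an entry in OpenSSH
// known_hosts format looks roughly like the line below; the address
// and truncated key are invented for illustration, and the exact
// fields depend on how the pin was recorded:
//
//	10.0.3.7 ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAA...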