package daemon

import (
	"context"
	"errors"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"banger/internal/api"
	"banger/internal/firecracker"
	"banger/internal/model"
	"banger/internal/system"
)

// StartVM boots the VM identified by idOrName under the per-VM lock.
// If the record says running and the Firecracker process is actually
// alive behind its API socket, the current record is returned as-is.
func (d *Daemon) StartVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return d.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		image, err := d.store.GetImageByID(ctx, vm.ImageID)
		if err != nil {
			return model.VMRecord{}, err
		}
		if vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
			if d.logger != nil {
				d.logger.Info("vm already running", vmLogAttrs(vm)...)
			}
			return vm, nil
		}
		return d.startVMLocked(ctx, vm, image)
	})
}

// startVMLocked performs the full boot sequence for vm from image.
// Caller must hold the VM lock. Before the device-mapper snapshot is
// created, failures simply return; after that point every failure goes
// through cleanupOnErr so host-side resources (loops, dm device, tap,
// sockets) are torn down and the error state is persisted.
func (d *Daemon) startVMLocked(ctx context.Context, vm model.VMRecord, image model.Image) (_ model.VMRecord, err error) {
	op := d.beginOperation("vm.start", append(vmLogAttrs(vm), imageLogAttrs(image)...)...)
	defer func() {
		if err != nil {
			// Point the caller at the Firecracker log for diagnosis.
			err = annotateLogPath(err, vm.Runtime.LogPath)
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()

	op.stage("preflight")
	vmCreateStage(ctx, "preflight", "checking host prerequisites")
	if err := d.validateStartPrereqs(ctx, vm, image); err != nil {
		return model.VMRecord{}, err
	}
	if err := os.MkdirAll(vm.Runtime.VMDir, 0o755); err != nil {
		return model.VMRecord{}, err
	}

	// Remove any leftovers from a previous run before allocating anew.
	op.stage("cleanup_runtime")
	if err := d.cleanupRuntime(ctx, vm, true); err != nil {
		return model.VMRecord{}, err
	}
	clearRuntimeHandles(&vm)

	op.stage("bridge")
	if err := d.ensureBridge(ctx); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("socket_dir")
	if err := d.ensureSocketDir(); err != nil {
		return model.VMRecord{}, err
	}

	// Per-VM resource names are derived from the short VM ID.
	shortID := system.ShortID(vm.ID)
	apiSock := filepath.Join(d.layout.RuntimeDir, "fc-"+shortID+".sock")
	dmName := "fc-rootfs-" + shortID
	tapName := "tap-fc-" + shortID

	if strings.TrimSpace(vm.Runtime.VSockPath) == "" {
		vm.Runtime.VSockPath = defaultVSockPath(d.layout.RuntimeDir, vm.ID)
	}
	if vm.Runtime.VSockCID == 0 {
		vm.Runtime.VSockCID, err = defaultVSockCID(vm.Runtime.GuestIP)
		if err != nil {
			return model.VMRecord{}, err
		}
	}

	// Firecracker refuses to bind over existing socket files.
	if err := os.RemoveAll(apiSock); err != nil && !os.IsNotExist(err) {
		return model.VMRecord{}, err
	}
	if err := os.RemoveAll(vm.Runtime.VSockPath); err != nil && !os.IsNotExist(err) {
		return model.VMRecord{}, err
	}

	op.stage("system_overlay", "overlay_path", vm.Runtime.SystemOverlay)
	vmCreateStage(ctx, "prepare_rootfs", "preparing system overlay")
	if err := d.ensureSystemOverlay(ctx, &vm); err != nil {
		return model.VMRecord{}, err
	}

	op.stage("dm_snapshot", "dm_name", dmName)
	vmCreateStage(ctx, "prepare_rootfs", "creating root filesystem snapshot")
	handles, err := d.createDMSnapshot(ctx, image.RootfsPath, vm.Runtime.SystemOverlay, dmName)
	if err != nil {
		return model.VMRecord{}, err
	}
	vm.Runtime.BaseLoop = handles.BaseLoop
	vm.Runtime.COWLoop = handles.COWLoop
	vm.Runtime.DMName = handles.DMName
	vm.Runtime.DMDev = handles.DMDev
	vm.Runtime.APISockPath = apiSock
	vm.Runtime.State = model.VMStateRunning
	vm.State = model.VMStateRunning
	vm.Runtime.LastError = ""

	// From here on, failures must release the snapshot/tap/process and
	// persist the error state. Fresh contexts are used because the
	// request ctx may already be cancelled when we get here.
	cleanupOnErr := func(err error) (model.VMRecord, error) {
		vm.State = model.VMStateError
		vm.Runtime.State = model.VMStateError
		vm.Runtime.LastError = err.Error()
		op.stage("cleanup_after_failure", "error", err.Error())
		if cleanupErr := d.cleanupRuntime(context.Background(), vm, true); cleanupErr != nil {
			err = errors.Join(err, cleanupErr)
		}
		clearRuntimeHandles(&vm)
		_ = d.store.UpsertVM(context.Background(), vm)
		return model.VMRecord{}, err
	}

	op.stage("patch_root_overlay")
	vmCreateStage(ctx, "prepare_rootfs", "writing guest configuration")
	if err := d.patchRootOverlay(ctx, vm, image); err != nil {
		return cleanupOnErr(err)
	}

	op.stage("prepare_host_features")
	vmCreateStage(ctx, "prepare_host_features", "preparing host-side vm features")
	if err := d.prepareCapabilityHosts(ctx, &vm, image); err != nil {
		return cleanupOnErr(err)
	}

	op.stage("tap")
	tap, err := d.acquireTap(ctx, tapName)
	if err != nil {
		return cleanupOnErr(err)
	}
	vm.Runtime.TapDevice = tap

	// Pre-create the metrics file so Firecracker can open it.
	op.stage("metrics_file", "metrics_path", vm.Runtime.MetricsPath)
	if err := os.WriteFile(vm.Runtime.MetricsPath, nil, 0o644); err != nil {
		return cleanupOnErr(err)
	}

	op.stage("firecracker_binary")
	fcPath, err := d.firecrackerBinary()
	if err != nil {
		return cleanupOnErr(err)
	}

	op.stage("firecracker_launch", "log_path", vm.Runtime.LogPath, "metrics_path", vm.Runtime.MetricsPath)
	vmCreateStage(ctx, "boot_firecracker", "starting firecracker")
	machineConfig := firecracker.MachineConfig{
		BinaryPath:      fcPath,
		VMID:            vm.ID,
		SocketPath:      apiSock,
		LogPath:         vm.Runtime.LogPath,
		MetricsPath:     vm.Runtime.MetricsPath,
		KernelImagePath: image.KernelPath,
		InitrdPath:      image.InitrdPath,
		KernelArgs:      system.BuildBootArgs(vm.Name),
		Drives: []firecracker.DriveConfig{{
			ID:       "rootfs",
			Path:     vm.Runtime.DMDev,
			ReadOnly: false,
			IsRoot:   true,
		}},
		TapDevice: tap,
		VSockPath: vm.Runtime.VSockPath,
		VSockCID:  vm.Runtime.VSockCID,
		VCPUCount: vm.Spec.VCPUCount,
		MemoryMiB: vm.Spec.MemoryMiB,
		Logger:    d.logger,
	}
	// Let optional capabilities (extra drives, devices, ...) adjust the config.
	d.contributeMachineConfig(&machineConfig, vm, image)

	machine, err := firecracker.NewMachine(ctx, machineConfig)
	if err != nil {
		return cleanupOnErr(err)
	}
	if err := machine.Start(ctx); err != nil {
		// Use a fresh context: the request ctx may already be cancelled (client
		// disconnect), but we still need the PID so cleanupRuntime can kill the
		// Firecracker process that was spawned before the failure.
		vm.Runtime.PID = d.resolveFirecrackerPID(context.Background(), machine, apiSock)
		return cleanupOnErr(err)
	}
	vm.Runtime.PID = d.resolveFirecrackerPID(context.Background(), machine, apiSock)
	op.debugStage("firecracker_started", "pid", vm.Runtime.PID)

	op.stage("socket_access", "api_socket", apiSock)
	if err := d.ensureSocketAccess(ctx, apiSock, "firecracker api socket"); err != nil {
		return cleanupOnErr(err)
	}
	op.stage("vsock_access", "vsock_path", vm.Runtime.VSockPath, "vsock_cid", vm.Runtime.VSockCID)
	if err := d.ensureSocketAccess(ctx, vm.Runtime.VSockPath, "firecracker vsock socket"); err != nil {
		return cleanupOnErr(err)
	}

	vmCreateStage(ctx, "wait_vsock_agent", "waiting for guest vsock agent")
	if err := waitForGuestVSockAgent(ctx, d.logger, vm.Runtime.VSockPath, vsockReadyWait); err != nil {
		return cleanupOnErr(err)
	}

	op.stage("post_start_features")
	vmCreateStage(ctx, "wait_guest_ready", "waiting for guest services")
	if err := d.postStartCapabilities(ctx, vm, image); err != nil {
		return cleanupOnErr(err)
	}

	system.TouchNow(&vm)
	op.stage("persist")
	vmCreateStage(ctx, "finalize", "saving vm state")
	if err := d.store.UpsertVM(ctx, vm); err != nil {
		return cleanupOnErr(err)
	}
	return vm, nil
}

// StopVM gracefully shuts down the VM identified by idOrName under the
// per-VM lock.
func (d *Daemon) StopVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return d.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		return d.stopVMLocked(ctx, vm)
	})
}

// stopVMLocked stops a running VM via Ctrl-Alt-Del, waits up to
// gracefulShutdownWait for the process to exit, then tears down runtime
// state. If the record is stale (not actually running) it just
// reconciles the record to stopped. Caller must hold the VM lock.
func (d *Daemon) stopVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
	vm = current
	op := d.beginOperation("vm.stop", "vm_ref", vm.ID)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()

	if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
		op.stage("cleanup_stale_runtime")
		return d.finalizeStopped(ctx, vm, false)
	}

	op.stage("graceful_shutdown")
	if err := d.sendCtrlAltDel(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("wait_for_exit", "pid", vm.Runtime.PID)
	if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, gracefulShutdownWait); err != nil {
		if !errors.Is(err, errWaitForExitTimeout) {
			return model.VMRecord{}, err
		}
		// Timeout is not fatal: cleanupRuntime below will kill the process.
		op.stage("graceful_shutdown_timeout", "pid", vm.Runtime.PID)
	}
	op.stage("cleanup_runtime")
	return d.finalizeStopped(ctx, vm, true)
}

// KillVM sends a signal to the VM's Firecracker process under the
// per-VM lock, then cleans up its runtime state.
func (d *Daemon) KillVM(ctx context.Context, params api.VMKillParams) (model.VMRecord, error) {
	return d.withVMLockByRef(ctx, params.IDOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		return d.killVMLocked(ctx, vm, params.Signal)
	})
}

// killVMLocked signals the Firecracker process (default TERM), waits
// for it to exit, and tears down runtime state. Stale records are
// reconciled to stopped without signalling. Caller must hold the VM lock.
func (d *Daemon) killVMLocked(ctx context.Context, current model.VMRecord, signalValue string) (vm model.VMRecord, err error) {
	vm = current
	op := d.beginOperation("vm.kill", "vm_ref", vm.ID, "signal", signalValue)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()

	if vm.State != model.VMStateRunning || !system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
		op.stage("cleanup_stale_runtime")
		return d.finalizeStopped(ctx, vm, false)
	}

	signal := strings.TrimSpace(signalValue)
	if signal == "" {
		signal = "TERM"
	}
	op.stage("send_signal", "pid", vm.Runtime.PID, "signal", signal)
	if _, err := d.runner.RunSudo(ctx, "kill", "-"+signal, strconv.Itoa(vm.Runtime.PID)); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("wait_for_exit", "pid", vm.Runtime.PID)
	// NOTE(review): kill uses a fixed 30s wait while stop uses
	// gracefulShutdownWait — kept as-is since the values may intentionally
	// differ; confirm whether these should share one constant.
	if err := d.waitForExit(ctx, vm.Runtime.PID, vm.Runtime.APISockPath, 30*time.Second); err != nil {
		if !errors.Is(err, errWaitForExitTimeout) {
			return model.VMRecord{}, err
		}
		op.stage("signal_timeout", "pid", vm.Runtime.PID, "signal", signal)
	}
	op.stage("cleanup_runtime")
	return d.finalizeStopped(ctx, vm, true)
}

// finalizeStopped tears down host-side runtime resources, marks the
// record stopped, clears runtime handles, and persists the record.
// touch refreshes the record's timestamp and is set only when the VM
// actually transitioned (not when reconciling an already-dead record).
func (d *Daemon) finalizeStopped(ctx context.Context, vm model.VMRecord, touch bool) (model.VMRecord, error) {
	if err := d.cleanupRuntime(ctx, vm, true); err != nil {
		return model.VMRecord{}, err
	}
	vm.State = model.VMStateStopped
	vm.Runtime.State = model.VMStateStopped
	clearRuntimeHandles(&vm)
	if touch {
		system.TouchNow(&vm)
	}
	if err := d.store.UpsertVM(ctx, vm); err != nil {
		return model.VMRecord{}, err
	}
	return vm, nil
}

// RestartVM stops then starts the VM identified by idOrName, holding
// the VM lock across both phases so no other operation interleaves.
func (d *Daemon) RestartVM(ctx context.Context, idOrName string) (vm model.VMRecord, err error) {
	op := d.beginOperation("vm.restart", "vm_ref", idOrName)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()

	resolved, err := d.FindVM(ctx, idOrName)
	if err != nil {
		return model.VMRecord{}, err
	}
	return d.withVMLockByID(ctx, resolved.ID, func(vm model.VMRecord) (model.VMRecord, error) {
		op.stage("stop")
		// Assigns the captured outer err so the deferred op.fail/op.done
		// logging sees the stop failure.
		vm, err = d.stopVMLocked(ctx, vm)
		if err != nil {
			return model.VMRecord{}, err
		}
		image, err := d.store.GetImageByID(ctx, vm.ImageID)
		if err != nil {
			return model.VMRecord{}, err
		}
		op.stage("start", vmLogAttrs(vm)...)
		return d.startVMLocked(ctx, vm, image)
	})
}

// DeleteVM removes the VM identified by idOrName — killing it first if
// running — along with its store record and on-disk directory.
func (d *Daemon) DeleteVM(ctx context.Context, idOrName string) (model.VMRecord, error) {
	return d.withVMLockByRef(ctx, idOrName, func(vm model.VMRecord) (model.VMRecord, error) {
		return d.deleteVMLocked(ctx, vm)
	})
}

// deleteVMLocked kills the process if alive (best effort), cleans up
// runtime resources, deletes the store record, and removes the VM
// directory. Caller must hold the VM lock.
func (d *Daemon) deleteVMLocked(ctx context.Context, current model.VMRecord) (vm model.VMRecord, err error) {
	vm = current
	op := d.beginOperation("vm.delete", "vm_ref", vm.ID)
	defer func() {
		if err != nil {
			op.fail(err, vmLogAttrs(vm)...)
			return
		}
		op.done(vmLogAttrs(vm)...)
	}()

	if vm.State == model.VMStateRunning && system.ProcessRunning(vm.Runtime.PID, vm.Runtime.APISockPath) {
		op.stage("kill_running_vm", "pid", vm.Runtime.PID)
		// Best effort: cleanupRuntime below handles a process that
		// survives the kill.
		_ = d.killVMProcess(ctx, vm.Runtime.PID)
	}
	op.stage("cleanup_runtime")
	if err := d.cleanupRuntime(ctx, vm, false); err != nil {
		return model.VMRecord{}, err
	}
	op.stage("delete_store_record")
	if err := d.store.DeleteVM(ctx, vm.ID); err != nil {
		return model.VMRecord{}, err
	}
	if vm.Runtime.VMDir != "" {
		op.stage("delete_vm_dir", "vm_dir", vm.Runtime.VMDir)
		if err := os.RemoveAll(vm.Runtime.VMDir); err != nil {
			return model.VMRecord{}, err
		}
	}
	return vm, nil
}