from __future__ import annotations

import inspect
import logging
import threading
import time
from typing import Any

from config import Config
from constants import DEFAULT_CONFIG_PATH, RECORD_TIMEOUT_SEC
from diagnostics import (
    doctor_command,
    format_support_line,
    journalctl_command,
    self_check_command,
    verbose_run_command,
)
from engine.pipeline import PipelineEngine
from recorder import start_recording as start_audio_recording
from recorder import stop_recording as stop_audio_recording
from stages.asr_whisper import AsrResult, WhisperAsrStage
from vocabulary import VocabularyEngine
from aman_processing import (
    build_editor_stage,
    build_whisper_model,
    process_transcript_pipeline,
    resolve_whisper_model_spec,
)


class State:
    """String constants naming the states stored in ``Daemon.state``."""

    IDLE = "idle"
    RECORDING = "recording"
    STT = "stt"
    PROCESSING = "processing"
    OUTPUTTING = "outputting"


def _log_support_issue(
    level: int,
    issue_id: str,
    message: str,
    *,
    next_step: str = "",
) -> None:
    """Log *message* at *level*, formatted through ``format_support_line``."""
    logging.log(level, format_support_line(issue_id, message, next_step=next_step))


def _build_stt_hint_kwargs(vocabulary: Any, model: Any) -> dict[str, Any]:
    """Return the vocabulary hint kwargs that ``model.transcribe`` accepts.

    The hints come from ``vocabulary.build_stt_hints()``. A hint is only
    included when it is non-empty and ``model.transcribe`` declares a
    parameter of the same name (``hotwords`` / ``initial_prompt``). An empty
    dict is returned when there are no hints, when the signature cannot be
    inspected, or when none of the hint parameters are declared.
    """
    hotwords, initial_prompt = vocabulary.build_stt_hints()
    if not hotwords and not initial_prompt:
        return {}

    try:
        signature = inspect.signature(model.transcribe)
    except (TypeError, ValueError):
        logging.debug("stt signature inspection failed; skipping hints")
        return {}

    params = signature.parameters
    kwargs: dict[str, Any] = {}
    if hotwords and "hotwords" in params:
        kwargs["hotwords"] = hotwords
    if initial_prompt and "initial_prompt" in params:
        kwargs["initial_prompt"] = initial_prompt
    if not kwargs:
        logging.debug("stt hint arguments are not supported by this whisper runtime")
    return kwargs


class Daemon:
    """Drive the record -> STT -> editor -> text-injection cycle.

    ``self.lock`` guards ``state``, ``_paused``, the active ``stream`` /
    ``record`` / ``timer`` and the swap of runtime components performed by
    ``apply_config``. Methods whose name ends in ``_locked`` are called with
    that lock already held.
    """

    def __init__(
        self,
        cfg: Config,
        desktop,
        *,
        verbose: bool = False,
        config_path=None,
    ):
        """Build the whisper model, ASR stage, editor stage and pipeline.

        Args:
            cfg: Runtime configuration.
            desktop: Object providing ``start_cancel_listener``,
                ``stop_cancel_listener`` and ``inject_text``.
            verbose: Passed to the editor stage; also enables logging of
                transcript text instead of only its length.
            config_path: Path used in diagnostic command hints; falls back
                to ``DEFAULT_CONFIG_PATH`` when falsy.

        Raises:
            RuntimeError: If editor warmup fails and
                ``cfg.advanced.strict_startup`` is truthy.
        """
        self.cfg = cfg
        self.desktop = desktop
        self.verbose = verbose
        self.config_path = config_path or DEFAULT_CONFIG_PATH
        self.lock = threading.Lock()
        self._shutdown_requested = threading.Event()
        self._paused = False
        self.state = State.IDLE
        self.stream = None
        self.record = None
        self.timer: threading.Timer | None = None
        self.vocabulary = VocabularyEngine(cfg.vocabulary)
        self._stt_hint_kwargs_cache: dict[str, Any] | None = None
        self.model = build_whisper_model(
            resolve_whisper_model_spec(cfg),
            cfg.stt.device,
        )
        self.asr_stage = WhisperAsrStage(
            self.model,
            configured_language=cfg.stt.language,
            hint_kwargs_provider=self._stt_hint_kwargs,
        )
        logging.info("initializing editor stage (local_llama_builtin)")
        self.editor_stage = build_editor_stage(cfg, verbose=self.verbose)
        self._warmup_editor_stage()
        self.pipeline = PipelineEngine(
            asr_stage=self.asr_stage,
            editor_stage=self.editor_stage,
            vocabulary=self.vocabulary,
            safety_enabled=cfg.safety.enabled,
            safety_strict=cfg.safety.strict,
        )
        logging.info("editor stage ready")
        self.log_transcript = verbose

    def _arm_cancel_listener(self) -> bool:
        """Start the desktop cancel listener; return ``False`` on failure."""
        try:
            self.desktop.start_cancel_listener(lambda: self.cancel_recording())
            return True
        except Exception as exc:
            logging.error("failed to start cancel listener: %s", exc)
            return False

    def _disarm_cancel_listener(self):
        """Stop the desktop cancel listener, logging (not raising) failures."""
        try:
            self.desktop.stop_cancel_listener()
        except Exception as exc:
            logging.debug("failed to stop cancel listener: %s", exc)

    def set_state(self, state: str):
        """Set the current state under the lock and log the transition."""
        with self.lock:
            prev = self.state
            self.state = state
        if prev != state:
            logging.debug("state: %s -> %s", prev, state)
        else:
            logging.debug("redundant state set: %s", state)

    def get_state(self):
        """Return the current state, read under the lock."""
        with self.lock:
            return self.state

    def request_shutdown(self):
        """Set the shutdown flag checked by ``toggle`` and the stop worker."""
        self._shutdown_requested.set()

    def is_paused(self) -> bool:
        """Return whether the daemon is currently paused."""
        with self.lock:
            return self._paused

    def toggle_paused(self) -> bool:
        """Flip the paused flag and return its new value."""
        with self.lock:
            self._paused = not self._paused
            paused = self._paused
        logging.info("pause %s", "enabled" if paused else "disabled")
        return paused

    def apply_config(self, cfg: Config) -> None:
        """Build a fresh runtime from *cfg* and swap it in under the lock.

        All new components are constructed (and the new editor stage warmed
        up) before anything on ``self`` is replaced, so an exception raised
        while building leaves the current runtime untouched. Unlike startup,
        the warmup call here is not wrapped: its exceptions propagate.
        """
        new_model = build_whisper_model(
            resolve_whisper_model_spec(cfg),
            cfg.stt.device,
        )
        new_vocabulary = VocabularyEngine(cfg.vocabulary)
        cached_hints: dict[str, Any] | None = None

        def _hint_kwargs_provider() -> dict[str, Any]:
            # Computed on first use and memoised for the lifetime of the
            # new ASR stage, bound to the new model and vocabulary.
            nonlocal cached_hints
            if cached_hints is None:
                cached_hints = _build_stt_hint_kwargs(new_vocabulary, new_model)
            return cached_hints

        new_asr_stage = WhisperAsrStage(
            new_model,
            configured_language=cfg.stt.language,
            hint_kwargs_provider=_hint_kwargs_provider,
        )
        new_editor_stage = build_editor_stage(cfg, verbose=self.verbose)
        new_editor_stage.warmup()
        new_pipeline = PipelineEngine(
            asr_stage=new_asr_stage,
            editor_stage=new_editor_stage,
            vocabulary=new_vocabulary,
            safety_enabled=cfg.safety.enabled,
            safety_strict=cfg.safety.strict,
        )
        with self.lock:
            self.cfg = cfg
            self.model = new_model
            self.vocabulary = new_vocabulary
            # Invalidate the instance-level cache so ``_stt_hint_kwargs``
            # recomputes against the new model/vocabulary if it is called.
            self._stt_hint_kwargs_cache = None
            self.asr_stage = new_asr_stage
            self.editor_stage = new_editor_stage
            self.pipeline = new_pipeline
        logging.info("applied new runtime config")

    def toggle(self):
        """Handle a trigger: start recording when idle, stop when recording.

        The trigger is ignored while shutdown is in progress, while paused
        (when idle), or while a previous recording is still being processed.
        """
        should_stop = False
        with self.lock:
            if self._shutdown_requested.is_set():
                logging.info("shutdown in progress, trigger ignored")
                return
            if self.state == State.IDLE:
                if self._paused:
                    logging.info("paused, trigger ignored")
                    return
                self._start_recording_locked()
                return
            if self.state == State.RECORDING:
                should_stop = True
            else:
                logging.info("busy (%s), trigger ignored", self.state)
        # ``stop_recording`` acquires ``self.lock`` itself, and the lock is
        # not re-entrant, so it must be called after the lock is released.
        if should_stop:
            self.stop_recording(trigger="user")

    def _start_recording_locked(self):
        """Open the audio stream and enter ``RECORDING``; lock must be held.

        Recording is abandoned (and the stream released) when the cancel
        listener cannot be armed. On success a daemon timer is started that
        stops the recording after ``RECORD_TIMEOUT_SEC``.
        """
        if self.state != State.IDLE:
            logging.info("busy (%s), trigger ignored", self.state)
            return
        try:
            stream, record = start_audio_recording(self.cfg.recording.input)
        except Exception as exc:
            _log_support_issue(
                logging.ERROR,
                "audio.input",
                f"record start failed: {exc}",
                next_step=(
                    f"run `{doctor_command(self.config_path)}` and verify the "
                    "selected input device"
                ),
            )
            return
        if not self._arm_cancel_listener():
            # Best-effort cleanup: a failure here must not mask the abort,
            # but it is logged so a leaked stream can be diagnosed.
            try:
                stream.stop()
            except Exception as exc:
                logging.debug("failed to stop stream after aborted start: %s", exc)
            try:
                stream.close()
            except Exception as exc:
                logging.debug("failed to close stream after aborted start: %s", exc)
            logging.error(
                "recording start aborted because cancel listener is unavailable"
            )
            return

        self.stream = stream
        self.record = record
        prev = self.state
        self.state = State.RECORDING
        logging.debug("state: %s -> %s", prev, self.state)
        logging.info("recording started")
        if self.timer:
            self.timer.cancel()
        self.timer = threading.Timer(RECORD_TIMEOUT_SEC, self._timeout_stop)
        self.timer.daemon = True
        self.timer.start()

    def _timeout_stop(self):
        """Timer callback: stop the recording with trigger ``"timeout"``."""
        self.stop_recording(trigger="timeout")

    def _start_stop_worker(
        self, stream: Any, record: Any, trigger: str, process_audio: bool
    ):
        """Run ``_stop_and_process`` on a new daemon thread."""
        threading.Thread(
            target=self._stop_and_process,
            args=(stream, record, trigger, process_audio),
            daemon=True,
        ).start()

    def _begin_stop_locked(self):
        """Detach the active recording and enter ``STT``; lock must be held.

        Returns:
            ``(stream, record)`` for the worker to finish, or ``None`` when
            not recording or when the recording resources are missing (in
            which case the state is reset to ``IDLE``).
        """
        if self.state != State.RECORDING:
            return None
        stream = self.stream
        record = self.record
        self.stream = None
        self.record = None
        if self.timer:
            self.timer.cancel()
            self.timer = None
        self._disarm_cancel_listener()
        prev = self.state
        self.state = State.STT
        logging.debug("state: %s -> %s", prev, self.state)

        if stream is None or record is None:
            logging.warning("recording resources are unavailable during stop")
            self.state = State.IDLE
            return None
        return stream, record

    def _stop_and_process(
        self, stream: Any, record: Any, trigger: str, process_audio: bool
    ):
        """Finish a recording and run it through STT, editing and injection.

        Every exit path returns the daemon to ``IDLE``. Failures in each
        phase are logged as support issues rather than raised.
        """
        logging.info("stopping recording (%s)", trigger)
        try:
            audio = stop_audio_recording(stream, record)
        except Exception as exc:
            _log_support_issue(
                logging.ERROR,
                "runtime.audio",
                f"record stop failed: {exc}",
                next_step=(
                    f"rerun `{doctor_command(self.config_path)}` and verify the "
                    "audio runtime"
                ),
            )
            self.set_state(State.IDLE)
            return

        # Cancelled recordings and shutdowns discard the captured audio.
        if not process_audio or self._shutdown_requested.is_set():
            self.set_state(State.IDLE)
            return

        if audio.size == 0:
            _log_support_issue(
                logging.ERROR,
                "runtime.audio",
                "no audio was captured from the active input device",
                next_step="verify the selected microphone level and rerun diagnostics",
            )
            self.set_state(State.IDLE)
            return

        try:
            logging.info("stt started")
            asr_result = self._transcribe_with_metrics(audio)
        except Exception as exc:
            _log_support_issue(
                logging.ERROR,
                "startup.readiness",
                f"stt failed: {exc}",
                next_step=(
                    f"run `{self_check_command(self.config_path)}` and then "
                    f"`{verbose_run_command(self.config_path)}`"
                ),
            )
            self.set_state(State.IDLE)
            return

        text = (asr_result.raw_text or "").strip()
        stt_lang = asr_result.language
        if not text:
            self.set_state(State.IDLE)
            return

        if self.log_transcript:
            logging.debug("stt: %s", text)
        else:
            logging.info("stt produced %d chars", len(text))

        if not self._shutdown_requested.is_set():
            self.set_state(State.PROCESSING)
        logging.info("editor stage started")
        try:
            text, _timings = process_transcript_pipeline(
                text,
                stt_lang=stt_lang,
                pipeline=self.pipeline,
                suppress_ai_errors=False,
                asr_result=asr_result,
                asr_ms=asr_result.latency_ms,
                verbose=self.log_transcript,
            )
        except Exception as exc:
            _log_support_issue(
                logging.ERROR,
                "model.cache",
                f"editor stage failed: {exc}",
                next_step=(
                    f"run `{self_check_command(self.config_path)}` and inspect "
                    f"`{journalctl_command()}` if the service keeps failing"
                ),
            )
            self.set_state(State.IDLE)
            return

        if self.log_transcript:
            logging.debug("processed: %s", text)
        else:
            logging.info("processed text length: %d", len(text))

        # Do not inject text once shutdown has been requested.
        if self._shutdown_requested.is_set():
            self.set_state(State.IDLE)
            return

        try:
            self.set_state(State.OUTPUTTING)
            logging.info("outputting started")
            backend = self.cfg.injection.backend
            self.desktop.inject_text(
                text,
                backend,
                remove_transcription_from_clipboard=(
                    self.cfg.injection.remove_transcription_from_clipboard
                ),
            )
        except Exception as exc:
            _log_support_issue(
                logging.ERROR,
                "injection.backend",
                f"output failed: {exc}",
                next_step=(
                    f"run `{doctor_command(self.config_path)}` and then "
                    f"`{verbose_run_command(self.config_path)}`"
                ),
            )
        finally:
            self.set_state(State.IDLE)

    def stop_recording(self, *, trigger: str = "user", process_audio: bool = True):
        """Stop the active recording and hand it to a worker thread.

        Does nothing when no recording is active.

        Args:
            trigger: Label logged as the reason for stopping.
            process_audio: When ``False`` the captured audio is discarded
                instead of being transcribed.
        """
        with self.lock:
            payload = self._begin_stop_locked()
        if payload is None:
            return
        stream, record = payload
        self._start_stop_worker(stream, record, trigger, process_audio)

    def cancel_recording(self):
        """Stop the active recording, if any, and discard its audio."""
        with self.lock:
            if self.state != State.RECORDING:
                return
        self.stop_recording(trigger="cancel", process_audio=False)

    def shutdown(self, timeout: float = 5.0) -> bool:
        """Request shutdown and wait up to *timeout* seconds for ``IDLE``.

        Returns:
            ``True`` if the daemon reached ``IDLE`` within the timeout.
        """
        self.request_shutdown()
        self._disarm_cancel_listener()
        self.stop_recording(trigger="shutdown", process_audio=False)
        return self.wait_for_idle(timeout)

    def wait_for_idle(self, timeout: float) -> bool:
        """Poll until the state is ``IDLE`` or *timeout* seconds elapse.

        Returns:
            ``True`` if the state is ``IDLE`` when polling ends.
        """
        # A monotonic clock keeps the deadline immune to wall-clock
        # adjustments while waiting.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if self.get_state() == State.IDLE:
                return True
            time.sleep(0.05)
        return self.get_state() == State.IDLE

    def _transcribe_with_metrics(self, audio) -> AsrResult:
        """Transcribe *audio* with the current ASR stage."""
        return self.asr_stage.transcribe(audio)

    def _transcribe(self, audio) -> tuple[str, str]:
        """Transcribe *audio* and return ``(raw_text, language)``."""
        result = self._transcribe_with_metrics(audio)
        return result.raw_text, result.language

    def _warmup_editor_stage(self) -> None:
        """Warm up the editor stage.

        Raises:
            RuntimeError: If warmup fails and
                ``cfg.advanced.strict_startup`` is truthy; otherwise the
                failure is logged as a warning and startup continues.
        """
        logging.info("warming up editor stage")
        try:
            self.editor_stage.warmup()
        except Exception as exc:
            if self.cfg.advanced.strict_startup:
                raise RuntimeError(f"editor stage warmup failed: {exc}") from exc
            logging.warning(
                "editor stage warmup failed, continuing because "
                "advanced.strict_startup=false: %s",
                exc,
            )
            return
        logging.info("editor stage warmup completed")

    def _stt_hint_kwargs(self) -> dict[str, Any]:
        """Return the memoised STT hint kwargs for the current model."""
        if self._stt_hint_kwargs_cache is None:
            self._stt_hint_kwargs_cache = _build_stt_hint_kwargs(
                self.vocabulary, self.model
            )
        return self._stt_hint_kwargs_cache