diff --git a/AGENTS.md b/AGENTS.md index f465830..778cac5 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,7 +4,7 @@ - `src/leld.py` is the primary entrypoint (X11 STT daemon). - `src/recorder.py` handles audio capture using PortAudio via `sounddevice`. -- `src/stt.py` wraps faster-whisper for STT. +- `src/leld.py` owns Whisper setup and transcription. ## Build, Test, and Development Commands diff --git a/README.md b/README.md index 84bf0bf..3a69e0f 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,6 @@ Create `~/.config/lel/config.json`: "ai_cleanup": { "model": "llama3.2:3b", - "temperature": 0.0, "base_url": "http://localhost:11434", "api_key": "" } diff --git a/src/aiprocess.py b/src/aiprocess.py index 8e8245d..0172089 100644 --- a/src/aiprocess.py +++ b/src/aiprocess.py @@ -5,21 +5,27 @@ import json import logging import sys import urllib.request -from dataclasses import dataclass from pathlib import Path +from dataclasses import dataclass -def load_system_prompt(path: str | None) -> str: - if path: - return Path(path).read_text(encoding="utf-8").strip() - return (Path(__file__).parent / "system_prompt.txt").read_text(encoding="utf-8").strip() +SYSTEM_PROMPT = ( + "You are an amanuensis. Rewrite the user's dictated text into clean, grammatical prose.\n\n" + "Rules:\n" + "- Remove filler words (um/uh/like), false starts, and self-corrections.\n" + "- Keep meaning, facts, and intent.\n" + "- Prefer concise sentences.\n" + "- Do not add new info.\n" + "- Output ONLY the cleaned text, no commentary.\n\n" + "Examples:\n" + " - \"schedule that for 5 PM, I mean 4 PM\" -> \"schedule that for 4 PM\"\n" + " - \"let's ask Bob, I mean Janice, let's ask Janice\" -> \"let's ask Janice\"\n" +) @dataclass class AIConfig: model: str - temperature: float - system_prompt_file: str base_url: str api_key: str timeout_sec: int @@ -30,7 +36,7 @@ class AIConfig: class GenericAPIProcessor: def __init__(self, cfg: AIConfig): self.cfg = cfg - self.system = load_system_prompt(cfg.system_prompt_file) + self.system = SYSTEM_PROMPT def process(self, text: str) -> str: language = self.cfg.language_hint or "" @@ -46,7 +52,7 @@ class GenericAPIProcessor: {"role": "system", "content": self.system}, {"role": "user", "content": user_content}, ], - "temperature": self.cfg.temperature, + "temperature": 0.0, } data = json.dumps(payload).encode("utf-8") url = _chat_completions_url(self.cfg.base_url) @@ -101,6 +107,10 @@ def list_models(base_url: str, api_key: str = "", timeout_sec: int = 10) -> list return [] +def load_system_prompt(_path: str | None = None) -> str: + return SYSTEM_PROMPT + + def _models_url(base_url: str) -> str: root = _root_url(base_url) return root.rstrip("/") + "/v1/models" @@ -149,14 +159,11 @@ def main() -> int: json.dumps(redacted_dict(cfg), indent=2), ) - prompt = load_system_prompt("") - logging.info("system prompt:\n%s", prompt) + logging.info("system prompt:\n%s", SYSTEM_PROMPT) processor = build_processor( AIConfig( model=cfg.ai_cleanup.get("model", ""), - temperature=cfg.ai_cleanup.get("temperature", 0.0), - system_prompt_file="", base_url=cfg.ai_cleanup.get("base_url", ""), api_key=cfg.ai_cleanup.get("api_key", ""), timeout_sec=25, diff --git a/src/config.py b/src/config.py index 1ccda0d..19e41d2 100644 --- a/src/config.py +++ b/src/config.py @@ -12,7 +12,6 @@ class Config: ai_cleanup: dict = field( default_factory=lambda: { "model": "llama3.2:3b", - "temperature": 0.0, "base_url": "http://localhost:11434", "api_key": "", } @@ -30,7 +29,7 @@ def load(path: str | None) -> Config: p = Path(path) if path else default_path() if p.exists(): data = json.loads(p.read_text(encoding="utf-8")) - if any(k in data for k in ("daemon", "recording", "stt", "injection", "ai_cleanup", "ai")): + if any(k in data for k in ("daemon", "recording", "stt", "injection", "ai_cleanup")): for k, v in data.items(): if hasattr(cfg, k): setattr(cfg, k, v) @@ -41,7 +40,6 @@ def load(path: str | None) -> Config: cfg.stt["device"] = data.get("whisper_device", cfg.stt["device"]) cfg.injection["backend"] = data.get("injection_backend", cfg.injection["backend"]) cfg.ai_cleanup["model"] = data.get("ai_model", cfg.ai_cleanup["model"]) - cfg.ai_cleanup["temperature"] = data.get("ai_temperature", cfg.ai_cleanup["temperature"]) cfg.ai_cleanup["base_url"] = data.get("ai_base_url", cfg.ai_cleanup["base_url"]) cfg.ai_cleanup["api_key"] = data.get("ai_api_key", cfg.ai_cleanup["api_key"]) @@ -56,20 +54,9 @@ def load(path: str | None) -> Config: if not isinstance(cfg.ai_cleanup, dict): cfg.ai_cleanup = { "model": "llama3.2:3b", - "temperature": 0.0, "base_url": "http://localhost:11434", "api_key": "", } - legacy_ai = getattr(cfg, "ai", None) - if isinstance(legacy_ai, dict) and not cfg.ai_cleanup: - cfg.ai_cleanup = legacy_ai - try: - delattr(cfg, "ai") - except AttributeError: - pass - except Exception: - pass - validate(cfg) return cfg diff --git a/src/leld.py b/src/leld.py index 6f7e40c..15505e0 100755 --- a/src/leld.py +++ b/src/leld.py @@ -9,13 +9,18 @@ import threading import time from pathlib import Path +import gi +from faster_whisper import WhisperModel + from config import Config, load, redacted_dict from recorder import start_recording, stop_recording -from stt import FasterWhisperSTT, STTConfig from aiprocess import AIConfig, build_processor from inject import inject from x11_hotkey import listen -from tray import run_tray + +gi.require_version("Gtk", "3.0") + +from gi.repository import GLib, Gtk # type: ignore[import-not-found] class State: @@ -26,6 +31,13 @@ class State: OUTPUTTING = "outputting" +def _compute_type(device: str) -> str: + dev = (device or "cpu").lower() + if dev == "cuda": + return "float16" + return "int8" + + class Daemon: def __init__(self, cfg: Config): self.cfg = cfg @@ -34,15 +46,12 @@ class Daemon: self.proc = None self.record = None self.timer = None - self.stt = FasterWhisperSTT( - STTConfig( - model=cfg.stt.get("model", "base"), - language=None, - device=cfg.stt.get("device", "cpu"), - vad_filter=True, - ) + self.model = WhisperModel( + cfg.stt.get("model", "base"), + device=cfg.stt.get("device", "cpu"), + compute_type=_compute_type(cfg.stt.get("device", "cpu")), ) - self.ai = None + self.tray = _Tray(self.get_state, self._quit) def set_state(self, state: str): with self.lock: @@ -55,6 +64,9 @@ class Daemon: with self.lock: return self.state + def _quit(self): + os._exit(0) + def toggle(self): with self.lock: if self.state == State.IDLE: @@ -118,7 +130,7 @@ class Daemon: try: self.set_state(State.STT) logging.info("stt started") - text = self.stt.transcribe(record.wav_path, language="en") + text = self._transcribe(record.wav_path) except Exception as exc: logging.error("stt failed: %s", exc) self.set_state(State.IDLE) @@ -131,7 +143,6 @@ class Daemon: logging.info("stt: %s", text) - ai_prompt_file = "" ai_model = (self.cfg.ai_cleanup.get("model") or "").strip() ai_base_url = (self.cfg.ai_cleanup.get("base_url") or "").strip() if ai_model and ai_base_url: @@ -141,8 +152,6 @@ class Daemon: processor = build_processor( AIConfig( model=ai_model, - temperature=self.cfg.ai_cleanup.get("temperature", 0.0), - system_prompt_file=ai_prompt_file, base_url=ai_base_url, api_key=self.cfg.ai_cleanup.get("api_key", ""), timeout_sec=25, @@ -174,6 +183,69 @@ class Daemon: self.state = State.STT threading.Thread(target=self._stop_and_process, daemon=True).start() + def _transcribe(self, wav_path: str) -> str: + segments, _info = self.model.transcribe( + wav_path, + language=None, + vad_filter=True, + ) + parts = [] + for seg in segments: + text = (seg.text or "").strip() + if text: + parts.append(text) + return " ".join(parts).strip() + + def run_tray(self): + self.tray.run() + + +class _Tray: + def __init__(self, state_getter, on_quit): + self.state_getter = state_getter + self.on_quit = on_quit + self.base = Path(__file__).parent / "assets" + self.icon = Gtk.StatusIcon() + self.icon.set_visible(True) + self.icon.connect("popup-menu", self._on_menu) + self.menu = Gtk.Menu() + quit_item = Gtk.MenuItem(label="Quit") + quit_item.connect("activate", lambda *_: self.on_quit()) + self.menu.append(quit_item) + self.menu.show_all() + + def _on_menu(self, _icon, _button, _time): + self.menu.popup(None, None, None, None, 0, _time) + + def _icon_path(self, state: str) -> str: + if state == State.RECORDING: + return str(self.base / "recording.png") + if state == State.STT: + return str(self.base / "transcribing.png") + if state == State.PROCESSING: + return str(self.base / "processing.png") + return str(self.base / "idle.png") + + def _title(self, state: str) -> str: + if state == State.RECORDING: + return "Recording" + if state == State.STT: + return "STT" + if state == State.PROCESSING: + return "AI Processing" + return "Idle" + + def update(self): + state = self.state_getter() + self.icon.set_from_file(self._icon_path(state)) + self.icon.set_tooltip_text(self._title(state)) + return True + + def run(self): + self.update() + GLib.timeout_add(250, self.update) + Gtk.main() + def _lock_single_instance(): runtime_dir = Path(os.getenv("XDG_RUNTIME_DIR", "/tmp")) / "lel" @@ -199,8 +271,6 @@ def main(): logging.basicConfig(stream=sys.stderr, level=logging.INFO, format="leld: %(asctime)s %(message)s") cfg = load(args.config) - config_path = Path(args.config) if args.config else Path.home() / ".config" / "lel" / "config.json" - _lock_single_instance() logging.info("ready (hotkey: %s)", cfg.daemon.get("hotkey", "")) @@ -208,9 +278,6 @@ def main(): daemon = Daemon(cfg) - def on_quit(): - os._exit(0) - def handle_signal(_sig, _frame): logging.info("signal received, shutting down") daemon.stop_recording() @@ -236,7 +303,7 @@ def main(): ), daemon=True, ).start() - run_tray(daemon.get_state, on_quit, None) + daemon.run_tray() if __name__ == "__main__": diff --git a/src/stt.py b/src/stt.py deleted file mode 100644 index c6bcb18..0000000 --- a/src/stt.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass - -from faster_whisper import WhisperModel - - -@dataclass -class STTConfig: - model: str - language: str | None - device: str - vad_filter: bool - - -def _compute_type(device: str) -> str: - dev = (device or "cpu").lower() - if dev == "cuda": - return "float16" - return "int8" - - -class FasterWhisperSTT: - def __init__(self, cfg: STTConfig): - self.cfg = cfg - self._model: WhisperModel | None = None - - def _load(self): - if self._model is None: - self._model = WhisperModel( - self.cfg.model, - device=self.cfg.device or "cpu", - compute_type=_compute_type(self.cfg.device), - ) - - def transcribe(self, wav_path: str, language: str | None = None) -> str: - self._load() - segments, _info = self._model.transcribe( # type: ignore[union-attr] - wav_path, - language=language or self.cfg.language, - vad_filter=self.cfg.vad_filter, - ) - parts = [] - for seg in segments: - text = (seg.text or "").strip() - if text: - parts.append(text) - return " ".join(parts).strip() diff --git a/src/system_prompt.txt b/src/system_prompt.txt deleted file mode 100644 index 4adec3d..0000000 --- a/src/system_prompt.txt +++ /dev/null @@ -1,12 +0,0 @@ -You are an amanuensis. Rewrite the user's dictated text into clean, grammatical prose. - -Rules: -- Remove filler words (um/uh/like), false starts, and self-corrections. -- Keep meaning, facts, and intent. -- Prefer concise sentences. -- Do not add new info. -- Output ONLY the cleaned text, no commentary. - -Examples: - - "schedule that for 5 PM, I mean 4 PM" -> "schedule that for 4 PM" - - "let's ask Bob, I mean Janice, let's ask Janice" -> "let's ask Janice" diff --git a/src/tray.py b/src/tray.py deleted file mode 100644 index c7007ad..0000000 --- a/src/tray.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import annotations - -import gi - -gi.require_version("Gtk", "3.0") - -from gi.repository import GLib, Gtk -from pathlib import Path - - -class Tray: - def __init__(self, state_getter, on_quit): - self.state_getter = state_getter - self.on_quit = on_quit - self.base = Path(__file__).parent / "assets" - self.icon = Gtk.StatusIcon() - self.icon.set_visible(True) - self.icon.connect("popup-menu", self._on_menu) - self.menu = Gtk.Menu() - quit_item = Gtk.MenuItem(label="Quit") - quit_item.connect("activate", lambda *_: self.on_quit()) - self.menu.append(quit_item) - self.menu.show_all() - - def _on_menu(self, _icon, _button, _time): - self.menu.popup(None, None, None, None, 0, _time) - - def _icon_path(self, state: str) -> str: - if state == "recording": - return str(self.base / "recording.png") - if state == "stt": - return str(self.base / "transcribing.png") - if state == "processing": - return str(self.base / "processing.png") - return str(self.base / "idle.png") - - def _title(self, state: str) -> str: - if state == "recording": - return "Recording" - if state == "stt": - return "STT" - if state == "processing": - return "AI Processing" - return "Idle" - - def update(self): - state = self.state_getter() - self.icon.set_from_file(self._icon_path(state)) - self.icon.set_tooltip_text(self._title(state)) - return True - - -def run_tray(state_getter, on_quit, on_settings): - tray = Tray(state_getter, on_quit) - tray.update() - GLib.timeout_add(250, tray.update) - if on_settings: - settings_item = Gtk.MenuItem(label="Settings") - settings_item.connect("activate", lambda *_: on_settings()) - tray.menu.prepend(settings_item) - tray.menu.show_all() - Gtk.main()