From 8c68719041d27d8f85df9262479f7822cb9ea187 Mon Sep 17 00:00:00 2001 From: Thales Maciel Date: Tue, 24 Feb 2026 11:15:48 -0300 Subject: [PATCH] Clean up config and STT naming --- AGENTS.md | 8 ++--- README.md | 15 ++------- pyproject.toml | 2 +- src/aiprocess.py | 3 -- src/config.py | 58 ++++---------------------------- src/leld.py | 38 ++++++++++----------- src/{transcription.py => stt.py} | 8 ++--- src/tray.py | 6 ++-- systemd/lel.service | 2 +- 9 files changed, 42 insertions(+), 98 deletions(-) rename src/{transcription.py => stt.py} (87%) diff --git a/AGENTS.md b/AGENTS.md index 385dfa5..f465830 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,9 +2,9 @@ ## Project Structure & Module Organization -- `src/leld.py` is the primary entrypoint (X11 transcription daemon). +- `src/leld.py` is the primary entrypoint (X11 STT daemon). - `src/recorder.py` handles audio capture using PortAudio via `sounddevice`. -- `src/transcription.py` wraps faster-whisper for transcription. +- `src/stt.py` wraps faster-whisper for STT. ## Build, Test, and Development Commands @@ -29,5 +29,5 @@ ## Configuration Tips -- Audio input is controlled via `WHISPER_FFMPEG_IN` (device index or name). -- Model, language, device, and extra args can be set with `WHISPER_MODEL`, `WHISPER_LANG`, `WHISPER_DEVICE`, and `WHISPER_EXTRA_ARGS`. +- Audio input is configured via the `recording.input` field in `config.json`. +- STT model and device are configured via the `stt` section in `config.json`. diff --git a/README.md b/README.md index 908276b..84bf0bf 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # lel -Python X11 transcription daemon that records audio, runs Whisper, logs the transcript, and can optionally run AI post-processing before injecting text. +Python X11 STT daemon that records audio, runs Whisper, logs the transcript, and can optionally run AI post-processing before injecting text. ## Requirements @@ -35,11 +35,10 @@ Create `~/.config/lel/config.json`: { "daemon": { "hotkey": "Cmd+m" }, "recording": { "input": "0" }, - "transcription": { "model": "base", "device": "cpu" }, + "stt": { "model": "base", "device": "cpu" }, "injection": { "backend": "clipboard" }, "ai_cleanup": { - "enabled": true, "model": "llama3.2:3b", "temperature": 0.0, "base_url": "http://localhost:11434", @@ -48,14 +47,6 @@ Create `~/.config/lel/config.json`: } ``` -Env overrides: - -- `WHISPER_MODEL`, `WHISPER_DEVICE` -- `WHISPER_FFMPEG_IN` (device index or name) -- `LEL_HOTKEY`, `LEL_INJECTION_BACKEND` -- `LEL_AI_CLEANUP_ENABLED`, `LEL_AI_CLEANUP_MODEL`, `LEL_AI_CLEANUP_TEMPERATURE` -- `LEL_AI_CLEANUP_BASE_URL`, `LEL_AI_CLEANUP_API_KEY` - Recording input can be a device index (preferred) or a substring of the device name. @@ -72,7 +63,7 @@ systemctl --user enable --now lel ## Usage - Press the hotkey once to start recording. -- Press it again to stop and transcribe. +- Press it again to stop and run STT. - The transcript is logged to stderr. Injection backends: diff --git a/pyproject.toml b/pyproject.toml index 76c6345..546c6f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "lel" version = "0.0.0" -description = "X11 transcription daemon with faster-whisper and optional AI cleanup" +description = "X11 STT daemon with faster-whisper and optional AI cleanup" readme = "README.md" requires-python = ">=3.10" dependencies = [ diff --git a/src/aiprocess.py b/src/aiprocess.py index f784fd5..8e8245d 100644 --- a/src/aiprocess.py +++ b/src/aiprocess.py @@ -149,9 +149,6 @@ def main() -> int: json.dumps(redacted_dict(cfg), indent=2), ) - if not cfg.ai_cleanup.get("enabled", False): - logging.warning("ai_enabled is false; proceeding anyway") - prompt = load_system_prompt("") logging.info("system prompt:\n%s", prompt) diff --git a/src/config.py b/src/config.py index a36b410..1ccda0d 100644 --- a/src/config.py +++ b/src/config.py @@ -1,22 +1,16 @@ import json -import os from dataclasses import dataclass, field from pathlib import Path -def _parse_bool(val: str) -> bool: - return val.strip().lower() in {"1", "true", "yes", "on"} - - @dataclass class Config: daemon: dict = field(default_factory=lambda: {"hotkey": "Cmd+m"}) recording: dict = field(default_factory=lambda: {"input": ""}) - transcription: dict = field(default_factory=lambda: {"model": "base", "device": "cpu"}) + stt: dict = field(default_factory=lambda: {"model": "base", "device": "cpu"}) injection: dict = field(default_factory=lambda: {"backend": "clipboard"}) ai_cleanup: dict = field( default_factory=lambda: { - "enabled": False, "model": "llama3.2:3b", "temperature": 0.0, "base_url": "http://localhost:11434", @@ -36,19 +30,16 @@ def load(path: str | None) -> Config: p = Path(path) if path else default_path() if p.exists(): data = json.loads(p.read_text(encoding="utf-8")) - if any(k in data for k in ("daemon", "recording", "transcription", "transcribing", "injection", "ai_cleanup", "ai")): + if any(k in data for k in ("daemon", "recording", "stt", "injection", "ai_cleanup", "ai")): for k, v in data.items(): if hasattr(cfg, k): setattr(cfg, k, v) - if "transcribing" in data and "transcription" not in data: - cfg.transcription = data.get("transcribing", cfg.transcription) else: cfg.daemon["hotkey"] = data.get("hotkey", cfg.daemon["hotkey"]) - cfg.recording["input"] = data.get("ffmpeg_input", cfg.recording["input"]) - cfg.transcription["model"] = data.get("whisper_model", cfg.transcription["model"]) - cfg.transcription["device"] = data.get("whisper_device", cfg.transcription["device"]) + cfg.recording["input"] = data.get("input", cfg.recording["input"]) + cfg.stt["model"] = data.get("whisper_model", cfg.stt["model"]) + cfg.stt["device"] = data.get("whisper_device", cfg.stt["device"]) cfg.injection["backend"] = data.get("injection_backend", cfg.injection["backend"]) - cfg.ai_cleanup["enabled"] = data.get("ai_enabled", cfg.ai_cleanup["enabled"]) cfg.ai_cleanup["model"] = data.get("ai_model", cfg.ai_cleanup["model"]) cfg.ai_cleanup["temperature"] = data.get("ai_temperature", cfg.ai_cleanup["temperature"]) cfg.ai_cleanup["base_url"] = data.get("ai_base_url", cfg.ai_cleanup["base_url"]) @@ -58,13 +49,12 @@ def load(path: str | None) -> Config: cfg.daemon = {"hotkey": "Cmd+m"} if not isinstance(cfg.recording, dict): cfg.recording = {"input": ""} - if not isinstance(cfg.transcription, dict): - cfg.transcription = {"model": "base", "device": "cpu"} + if not isinstance(cfg.stt, dict): + cfg.stt = {"model": "base", "device": "cpu"} if not isinstance(cfg.injection, dict): cfg.injection = {"backend": "clipboard"} if not isinstance(cfg.ai_cleanup, dict): cfg.ai_cleanup = { - "enabled": False, "model": "llama3.2:3b", "temperature": 0.0, "base_url": "http://localhost:11434", @@ -80,40 +70,6 @@ def load(path: str | None) -> Config: except Exception: pass - # env overrides - if os.getenv("WHISPER_MODEL"): - cfg.transcription["model"] = os.environ["WHISPER_MODEL"] - if os.getenv("WHISPER_DEVICE"): - cfg.transcription["device"] = os.environ["WHISPER_DEVICE"] - if os.getenv("WHISPER_FFMPEG_IN"): - cfg.recording["input"] = os.environ["WHISPER_FFMPEG_IN"] - - if os.getenv("LEL_HOTKEY"): - cfg.daemon["hotkey"] = os.environ["LEL_HOTKEY"] - if os.getenv("LEL_INJECTION_BACKEND"): - cfg.injection["backend"] = os.environ["LEL_INJECTION_BACKEND"] - - if os.getenv("LEL_AI_CLEANUP_ENABLED"): - cfg.ai_cleanup["enabled"] = _parse_bool(os.environ["LEL_AI_CLEANUP_ENABLED"]) - if os.getenv("LEL_AI_CLEANUP_MODEL"): - cfg.ai_cleanup["model"] = os.environ["LEL_AI_CLEANUP_MODEL"] - if os.getenv("LEL_AI_CLEANUP_TEMPERATURE"): - cfg.ai_cleanup["temperature"] = float(os.environ["LEL_AI_CLEANUP_TEMPERATURE"]) - if os.getenv("LEL_AI_CLEANUP_BASE_URL"): - cfg.ai_cleanup["base_url"] = os.environ["LEL_AI_CLEANUP_BASE_URL"] - if os.getenv("LEL_AI_CLEANUP_API_KEY"): - cfg.ai_cleanup["api_key"] = os.environ["LEL_AI_CLEANUP_API_KEY"] - - if os.getenv("LEL_AI_ENABLED"): - cfg.ai_cleanup["enabled"] = _parse_bool(os.environ["LEL_AI_ENABLED"]) - if os.getenv("LEL_AI_MODEL"): - cfg.ai_cleanup["model"] = os.environ["LEL_AI_MODEL"] - if os.getenv("LEL_AI_TEMPERATURE"): - cfg.ai_cleanup["temperature"] = float(os.environ["LEL_AI_TEMPERATURE"]) - if os.getenv("LEL_AI_BASE_URL"): - cfg.ai_cleanup["base_url"] = os.environ["LEL_AI_BASE_URL"] - if os.getenv("LEL_AI_API_KEY"): - cfg.ai_cleanup["api_key"] = os.environ["LEL_AI_API_KEY"] validate(cfg) return cfg diff --git a/src/leld.py b/src/leld.py index 691457e..6f7e40c 100755 --- a/src/leld.py +++ b/src/leld.py @@ -11,7 +11,7 @@ from pathlib import Path from config import Config, load, redacted_dict from recorder import start_recording, stop_recording -from transcription import FasterWhisperTranscriber, TranscriptionConfig +from stt import FasterWhisperSTT, STTConfig from aiprocess import AIConfig, build_processor from inject import inject from x11_hotkey import listen @@ -21,7 +21,7 @@ from tray import run_tray class State: IDLE = "idle" RECORDING = "recording" - TRANSCRIBING = "transcribing" + STT = "stt" PROCESSING = "processing" OUTPUTTING = "outputting" @@ -34,11 +34,11 @@ class Daemon: self.proc = None self.record = None self.timer = None - self.transcriber = FasterWhisperTranscriber( - TranscriptionConfig( - model=cfg.transcription.get("model", "base"), + self.stt = FasterWhisperSTT( + STTConfig( + model=cfg.stt.get("model", "base"), language=None, - device=cfg.transcription.get("device", "cpu"), + device=cfg.stt.get("device", "cpu"), vad_filter=True, ) ) @@ -61,7 +61,7 @@ class Daemon: self._start_recording_locked() return if self.state == State.RECORDING: - self.state = State.TRANSCRIBING + self.state = State.STT threading.Thread(target=self._stop_and_process, daemon=True).start() return logging.info("busy (%s), trigger ignored", self.state) @@ -86,7 +86,7 @@ class Daemon: with self.lock: if self.state != State.RECORDING: return - self.state = State.TRANSCRIBING + self.state = State.STT threading.Thread(target=self._stop_and_process, daemon=True).start() def _stop_and_process(self): @@ -116,11 +116,11 @@ class Daemon: return try: - self.set_state(State.TRANSCRIBING) - logging.info("transcription started") - text = self.transcriber.transcribe(record.wav_path, language="en") + self.set_state(State.STT) + logging.info("stt started") + text = self.stt.transcribe(record.wav_path, language="en") except Exception as exc: - logging.error("transcription failed: %s", exc) + logging.error("stt failed: %s", exc) self.set_state(State.IDLE) return @@ -129,21 +129,21 @@ class Daemon: self.set_state(State.IDLE) return - logging.info("transcription: %s", text) + logging.info("stt: %s", text) - ai_enabled = self.cfg.ai_cleanup.get("enabled", False) ai_prompt_file = "" - - if ai_enabled: + ai_model = (self.cfg.ai_cleanup.get("model") or "").strip() + ai_base_url = (self.cfg.ai_cleanup.get("base_url") or "").strip() + if ai_model and ai_base_url: self.set_state(State.PROCESSING) logging.info("ai processing started") try: processor = build_processor( AIConfig( - model=self.cfg.ai_cleanup.get("model", ""), + model=ai_model, temperature=self.cfg.ai_cleanup.get("temperature", 0.0), system_prompt_file=ai_prompt_file, - base_url=self.cfg.ai_cleanup.get("base_url", ""), + base_url=ai_base_url, api_key=self.cfg.ai_cleanup.get("api_key", ""), timeout_sec=25, language_hint="en", @@ -171,7 +171,7 @@ class Daemon: with self.lock: if self.state != State.RECORDING: return - self.state = State.TRANSCRIBING + self.state = State.STT threading.Thread(target=self._stop_and_process, daemon=True).start() diff --git a/src/transcription.py b/src/stt.py similarity index 87% rename from src/transcription.py rename to src/stt.py index 111d88c..c6bcb18 100644 --- a/src/transcription.py +++ b/src/stt.py @@ -6,7 +6,7 @@ from faster_whisper import WhisperModel @dataclass -class TranscriptionConfig: +class STTConfig: model: str language: str | None device: str @@ -20,8 +20,8 @@ def _compute_type(device: str) -> str: return "int8" -class FasterWhisperTranscriber: - def __init__(self, cfg: TranscriptionConfig): +class FasterWhisperSTT: + def __init__(self, cfg: STTConfig): self.cfg = cfg self._model: WhisperModel | None = None @@ -35,7 +35,7 @@ class FasterWhisperTranscriber: def transcribe(self, wav_path: str, language: str | None = None) -> str: self._load() - segments, _info = self._model.transcribe( + segments, _info = self._model.transcribe( # type: ignore[union-attr] wav_path, language=language or self.cfg.language, vad_filter=self.cfg.vad_filter, diff --git a/src/tray.py b/src/tray.py index 9e6e041..c7007ad 100644 --- a/src/tray.py +++ b/src/tray.py @@ -28,7 +28,7 @@ class Tray: def _icon_path(self, state: str) -> str: if state == "recording": return str(self.base / "recording.png") - if state == "transcribing": + if state == "stt": return str(self.base / "transcribing.png") if state == "processing": return str(self.base / "processing.png") @@ -37,8 +37,8 @@ class Tray: def _title(self, state: str) -> str: if state == "recording": return "Recording" - if state == "transcribing": - return "Transcribing" + if state == "stt": + return "STT" if state == "processing": return "AI Processing" return "Idle" diff --git a/systemd/lel.service b/systemd/lel.service index d466434..8a1d407 100644 --- a/systemd/lel.service +++ b/systemd/lel.service @@ -1,5 +1,5 @@ [Unit] -Description=lel X11 transcription daemon +Description=lel X11 STT daemon After=default.target [Service]