Clean up config and STT naming

This commit is contained in:
Thales Maciel 2026-02-24 11:15:48 -03:00
parent b74aaaa1c4
commit 8c68719041
No known key found for this signature in database
GPG key ID: 33112E6833C34679
9 changed files with 42 additions and 98 deletions

View file

@@ -2,9 +2,9 @@
## Project Structure & Module Organization ## Project Structure & Module Organization
- `src/leld.py` is the primary entrypoint (X11 transcription daemon). - `src/leld.py` is the primary entrypoint (X11 STT daemon).
- `src/recorder.py` handles audio capture using PortAudio via `sounddevice`. - `src/recorder.py` handles audio capture using PortAudio via `sounddevice`.
- `src/transcription.py` wraps faster-whisper for transcription. - `src/stt.py` wraps faster-whisper for STT.
## Build, Test, and Development Commands ## Build, Test, and Development Commands
@@ -29,5 +29,5 @@
## Configuration Tips ## Configuration Tips
- Audio input is controlled via `WHISPER_FFMPEG_IN` (device index or name). - Audio input is configured via the `recording.input` field in `config.json`.
- Model, language, device, and extra args can be set with `WHISPER_MODEL`, `WHISPER_LANG`, `WHISPER_DEVICE`, and `WHISPER_EXTRA_ARGS`. - STT model and device are configured via the `stt` section in `config.json`.

View file

@@ -1,6 +1,6 @@
# lel # lel
Python X11 transcription daemon that records audio, runs Whisper, logs the transcript, and can optionally run AI post-processing before injecting text. Python X11 STT daemon that records audio, runs Whisper, logs the transcript, and can optionally run AI post-processing before injecting text.
## Requirements ## Requirements
@@ -35,11 +35,10 @@ Create `~/.config/lel/config.json`:
{ {
"daemon": { "hotkey": "Cmd+m" }, "daemon": { "hotkey": "Cmd+m" },
"recording": { "input": "0" }, "recording": { "input": "0" },
"transcription": { "model": "base", "device": "cpu" }, "stt": { "model": "base", "device": "cpu" },
"injection": { "backend": "clipboard" }, "injection": { "backend": "clipboard" },
"ai_cleanup": { "ai_cleanup": {
"enabled": true,
"model": "llama3.2:3b", "model": "llama3.2:3b",
"temperature": 0.0, "temperature": 0.0,
"base_url": "http://localhost:11434", "base_url": "http://localhost:11434",
@@ -48,14 +47,6 @@ Create `~/.config/lel/config.json`:
} }
``` ```
Env overrides:
- `WHISPER_MODEL`, `WHISPER_DEVICE`
- `WHISPER_FFMPEG_IN` (device index or name)
- `LEL_HOTKEY`, `LEL_INJECTION_BACKEND`
- `LEL_AI_CLEANUP_ENABLED`, `LEL_AI_CLEANUP_MODEL`, `LEL_AI_CLEANUP_TEMPERATURE`
- `LEL_AI_CLEANUP_BASE_URL`, `LEL_AI_CLEANUP_API_KEY`
Recording input can be a device index (preferred) or a substring of the device Recording input can be a device index (preferred) or a substring of the device
name. name.
@@ -72,7 +63,7 @@ systemctl --user enable --now lel
## Usage ## Usage
- Press the hotkey once to start recording. - Press the hotkey once to start recording.
- Press it again to stop and transcribe. - Press it again to stop and run STT.
- The transcript is logged to stderr. - The transcript is logged to stderr.
Injection backends: Injection backends:

View file

@@ -1,7 +1,7 @@
[project] [project]
name = "lel" name = "lel"
version = "0.0.0" version = "0.0.0"
description = "X11 transcription daemon with faster-whisper and optional AI cleanup" description = "X11 STT daemon with faster-whisper and optional AI cleanup"
readme = "README.md" readme = "README.md"
requires-python = ">=3.10" requires-python = ">=3.10"
dependencies = [ dependencies = [

View file

@@ -149,9 +149,6 @@ def main() -> int:
json.dumps(redacted_dict(cfg), indent=2), json.dumps(redacted_dict(cfg), indent=2),
) )
if not cfg.ai_cleanup.get("enabled", False):
logging.warning("ai_enabled is false; proceeding anyway")
prompt = load_system_prompt("") prompt = load_system_prompt("")
logging.info("system prompt:\n%s", prompt) logging.info("system prompt:\n%s", prompt)

View file

@@ -1,22 +1,16 @@
import json import json
import os
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path from pathlib import Path
def _parse_bool(val: str) -> bool:
return val.strip().lower() in {"1", "true", "yes", "on"}
@dataclass @dataclass
class Config: class Config:
daemon: dict = field(default_factory=lambda: {"hotkey": "Cmd+m"}) daemon: dict = field(default_factory=lambda: {"hotkey": "Cmd+m"})
recording: dict = field(default_factory=lambda: {"input": ""}) recording: dict = field(default_factory=lambda: {"input": ""})
transcription: dict = field(default_factory=lambda: {"model": "base", "device": "cpu"}) stt: dict = field(default_factory=lambda: {"model": "base", "device": "cpu"})
injection: dict = field(default_factory=lambda: {"backend": "clipboard"}) injection: dict = field(default_factory=lambda: {"backend": "clipboard"})
ai_cleanup: dict = field( ai_cleanup: dict = field(
default_factory=lambda: { default_factory=lambda: {
"enabled": False,
"model": "llama3.2:3b", "model": "llama3.2:3b",
"temperature": 0.0, "temperature": 0.0,
"base_url": "http://localhost:11434", "base_url": "http://localhost:11434",
@@ -36,19 +30,16 @@ def load(path: str | None) -> Config:
p = Path(path) if path else default_path() p = Path(path) if path else default_path()
if p.exists(): if p.exists():
data = json.loads(p.read_text(encoding="utf-8")) data = json.loads(p.read_text(encoding="utf-8"))
if any(k in data for k in ("daemon", "recording", "transcription", "transcribing", "injection", "ai_cleanup", "ai")): if any(k in data for k in ("daemon", "recording", "stt", "injection", "ai_cleanup", "ai")):
for k, v in data.items(): for k, v in data.items():
if hasattr(cfg, k): if hasattr(cfg, k):
setattr(cfg, k, v) setattr(cfg, k, v)
if "transcribing" in data and "transcription" not in data:
cfg.transcription = data.get("transcribing", cfg.transcription)
else: else:
cfg.daemon["hotkey"] = data.get("hotkey", cfg.daemon["hotkey"]) cfg.daemon["hotkey"] = data.get("hotkey", cfg.daemon["hotkey"])
cfg.recording["input"] = data.get("ffmpeg_input", cfg.recording["input"]) cfg.recording["input"] = data.get("input", cfg.recording["input"])
cfg.transcription["model"] = data.get("whisper_model", cfg.transcription["model"]) cfg.stt["model"] = data.get("whisper_model", cfg.stt["model"])
cfg.transcription["device"] = data.get("whisper_device", cfg.transcription["device"]) cfg.stt["device"] = data.get("whisper_device", cfg.stt["device"])
cfg.injection["backend"] = data.get("injection_backend", cfg.injection["backend"]) cfg.injection["backend"] = data.get("injection_backend", cfg.injection["backend"])
cfg.ai_cleanup["enabled"] = data.get("ai_enabled", cfg.ai_cleanup["enabled"])
cfg.ai_cleanup["model"] = data.get("ai_model", cfg.ai_cleanup["model"]) cfg.ai_cleanup["model"] = data.get("ai_model", cfg.ai_cleanup["model"])
cfg.ai_cleanup["temperature"] = data.get("ai_temperature", cfg.ai_cleanup["temperature"]) cfg.ai_cleanup["temperature"] = data.get("ai_temperature", cfg.ai_cleanup["temperature"])
cfg.ai_cleanup["base_url"] = data.get("ai_base_url", cfg.ai_cleanup["base_url"]) cfg.ai_cleanup["base_url"] = data.get("ai_base_url", cfg.ai_cleanup["base_url"])
@@ -58,13 +49,12 @@ def load(path: str | None) -> Config:
cfg.daemon = {"hotkey": "Cmd+m"} cfg.daemon = {"hotkey": "Cmd+m"}
if not isinstance(cfg.recording, dict): if not isinstance(cfg.recording, dict):
cfg.recording = {"input": ""} cfg.recording = {"input": ""}
if not isinstance(cfg.transcription, dict): if not isinstance(cfg.stt, dict):
cfg.transcription = {"model": "base", "device": "cpu"} cfg.stt = {"model": "base", "device": "cpu"}
if not isinstance(cfg.injection, dict): if not isinstance(cfg.injection, dict):
cfg.injection = {"backend": "clipboard"} cfg.injection = {"backend": "clipboard"}
if not isinstance(cfg.ai_cleanup, dict): if not isinstance(cfg.ai_cleanup, dict):
cfg.ai_cleanup = { cfg.ai_cleanup = {
"enabled": False,
"model": "llama3.2:3b", "model": "llama3.2:3b",
"temperature": 0.0, "temperature": 0.0,
"base_url": "http://localhost:11434", "base_url": "http://localhost:11434",
@@ -80,40 +70,6 @@ def load(path: str | None) -> Config:
except Exception: except Exception:
pass pass
# env overrides
if os.getenv("WHISPER_MODEL"):
cfg.transcription["model"] = os.environ["WHISPER_MODEL"]
if os.getenv("WHISPER_DEVICE"):
cfg.transcription["device"] = os.environ["WHISPER_DEVICE"]
if os.getenv("WHISPER_FFMPEG_IN"):
cfg.recording["input"] = os.environ["WHISPER_FFMPEG_IN"]
if os.getenv("LEL_HOTKEY"):
cfg.daemon["hotkey"] = os.environ["LEL_HOTKEY"]
if os.getenv("LEL_INJECTION_BACKEND"):
cfg.injection["backend"] = os.environ["LEL_INJECTION_BACKEND"]
if os.getenv("LEL_AI_CLEANUP_ENABLED"):
cfg.ai_cleanup["enabled"] = _parse_bool(os.environ["LEL_AI_CLEANUP_ENABLED"])
if os.getenv("LEL_AI_CLEANUP_MODEL"):
cfg.ai_cleanup["model"] = os.environ["LEL_AI_CLEANUP_MODEL"]
if os.getenv("LEL_AI_CLEANUP_TEMPERATURE"):
cfg.ai_cleanup["temperature"] = float(os.environ["LEL_AI_CLEANUP_TEMPERATURE"])
if os.getenv("LEL_AI_CLEANUP_BASE_URL"):
cfg.ai_cleanup["base_url"] = os.environ["LEL_AI_CLEANUP_BASE_URL"]
if os.getenv("LEL_AI_CLEANUP_API_KEY"):
cfg.ai_cleanup["api_key"] = os.environ["LEL_AI_CLEANUP_API_KEY"]
if os.getenv("LEL_AI_ENABLED"):
cfg.ai_cleanup["enabled"] = _parse_bool(os.environ["LEL_AI_ENABLED"])
if os.getenv("LEL_AI_MODEL"):
cfg.ai_cleanup["model"] = os.environ["LEL_AI_MODEL"]
if os.getenv("LEL_AI_TEMPERATURE"):
cfg.ai_cleanup["temperature"] = float(os.environ["LEL_AI_TEMPERATURE"])
if os.getenv("LEL_AI_BASE_URL"):
cfg.ai_cleanup["base_url"] = os.environ["LEL_AI_BASE_URL"]
if os.getenv("LEL_AI_API_KEY"):
cfg.ai_cleanup["api_key"] = os.environ["LEL_AI_API_KEY"]
validate(cfg) validate(cfg)
return cfg return cfg

View file

@@ -11,7 +11,7 @@ from pathlib import Path
from config import Config, load, redacted_dict from config import Config, load, redacted_dict
from recorder import start_recording, stop_recording from recorder import start_recording, stop_recording
from transcription import FasterWhisperTranscriber, TranscriptionConfig from stt import FasterWhisperSTT, STTConfig
from aiprocess import AIConfig, build_processor from aiprocess import AIConfig, build_processor
from inject import inject from inject import inject
from x11_hotkey import listen from x11_hotkey import listen
@@ -21,7 +21,7 @@ from tray import run_tray
class State: class State:
IDLE = "idle" IDLE = "idle"
RECORDING = "recording" RECORDING = "recording"
TRANSCRIBING = "transcribing" STT = "stt"
PROCESSING = "processing" PROCESSING = "processing"
OUTPUTTING = "outputting" OUTPUTTING = "outputting"
@@ -34,11 +34,11 @@ class Daemon:
self.proc = None self.proc = None
self.record = None self.record = None
self.timer = None self.timer = None
self.transcriber = FasterWhisperTranscriber( self.stt = FasterWhisperSTT(
TranscriptionConfig( STTConfig(
model=cfg.transcription.get("model", "base"), model=cfg.stt.get("model", "base"),
language=None, language=None,
device=cfg.transcription.get("device", "cpu"), device=cfg.stt.get("device", "cpu"),
vad_filter=True, vad_filter=True,
) )
) )
@@ -61,7 +61,7 @@ class Daemon:
self._start_recording_locked() self._start_recording_locked()
return return
if self.state == State.RECORDING: if self.state == State.RECORDING:
self.state = State.TRANSCRIBING self.state = State.STT
threading.Thread(target=self._stop_and_process, daemon=True).start() threading.Thread(target=self._stop_and_process, daemon=True).start()
return return
logging.info("busy (%s), trigger ignored", self.state) logging.info("busy (%s), trigger ignored", self.state)
@@ -86,7 +86,7 @@ class Daemon:
with self.lock: with self.lock:
if self.state != State.RECORDING: if self.state != State.RECORDING:
return return
self.state = State.TRANSCRIBING self.state = State.STT
threading.Thread(target=self._stop_and_process, daemon=True).start() threading.Thread(target=self._stop_and_process, daemon=True).start()
def _stop_and_process(self): def _stop_and_process(self):
@@ -116,11 +116,11 @@ class Daemon:
return return
try: try:
self.set_state(State.TRANSCRIBING) self.set_state(State.STT)
logging.info("transcription started") logging.info("stt started")
text = self.transcriber.transcribe(record.wav_path, language="en") text = self.stt.transcribe(record.wav_path, language="en")
except Exception as exc: except Exception as exc:
logging.error("transcription failed: %s", exc) logging.error("stt failed: %s", exc)
self.set_state(State.IDLE) self.set_state(State.IDLE)
return return
@@ -129,21 +129,21 @@ class Daemon:
self.set_state(State.IDLE) self.set_state(State.IDLE)
return return
logging.info("transcription: %s", text) logging.info("stt: %s", text)
ai_enabled = self.cfg.ai_cleanup.get("enabled", False)
ai_prompt_file = "" ai_prompt_file = ""
ai_model = (self.cfg.ai_cleanup.get("model") or "").strip()
if ai_enabled: ai_base_url = (self.cfg.ai_cleanup.get("base_url") or "").strip()
if ai_model and ai_base_url:
self.set_state(State.PROCESSING) self.set_state(State.PROCESSING)
logging.info("ai processing started") logging.info("ai processing started")
try: try:
processor = build_processor( processor = build_processor(
AIConfig( AIConfig(
model=self.cfg.ai_cleanup.get("model", ""), model=ai_model,
temperature=self.cfg.ai_cleanup.get("temperature", 0.0), temperature=self.cfg.ai_cleanup.get("temperature", 0.0),
system_prompt_file=ai_prompt_file, system_prompt_file=ai_prompt_file,
base_url=self.cfg.ai_cleanup.get("base_url", ""), base_url=ai_base_url,
api_key=self.cfg.ai_cleanup.get("api_key", ""), api_key=self.cfg.ai_cleanup.get("api_key", ""),
timeout_sec=25, timeout_sec=25,
language_hint="en", language_hint="en",
@@ -171,7 +171,7 @@ class Daemon:
with self.lock: with self.lock:
if self.state != State.RECORDING: if self.state != State.RECORDING:
return return
self.state = State.TRANSCRIBING self.state = State.STT
threading.Thread(target=self._stop_and_process, daemon=True).start() threading.Thread(target=self._stop_and_process, daemon=True).start()

View file

@@ -6,7 +6,7 @@ from faster_whisper import WhisperModel
@dataclass @dataclass
class TranscriptionConfig: class STTConfig:
model: str model: str
language: str | None language: str | None
device: str device: str
@@ -20,8 +20,8 @@ def _compute_type(device: str) -> str:
return "int8" return "int8"
class FasterWhisperTranscriber: class FasterWhisperSTT:
def __init__(self, cfg: TranscriptionConfig): def __init__(self, cfg: STTConfig):
self.cfg = cfg self.cfg = cfg
self._model: WhisperModel | None = None self._model: WhisperModel | None = None
@@ -35,7 +35,7 @@ class FasterWhisperTranscriber:
def transcribe(self, wav_path: str, language: str | None = None) -> str: def transcribe(self, wav_path: str, language: str | None = None) -> str:
self._load() self._load()
segments, _info = self._model.transcribe( segments, _info = self._model.transcribe( # type: ignore[union-attr]
wav_path, wav_path,
language=language or self.cfg.language, language=language or self.cfg.language,
vad_filter=self.cfg.vad_filter, vad_filter=self.cfg.vad_filter,

View file

@@ -28,7 +28,7 @@ class Tray:
def _icon_path(self, state: str) -> str: def _icon_path(self, state: str) -> str:
if state == "recording": if state == "recording":
return str(self.base / "recording.png") return str(self.base / "recording.png")
if state == "transcribing": if state == "stt":
return str(self.base / "transcribing.png") return str(self.base / "transcribing.png")
if state == "processing": if state == "processing":
return str(self.base / "processing.png") return str(self.base / "processing.png")
@@ -37,8 +37,8 @@ class Tray:
def _title(self, state: str) -> str: def _title(self, state: str) -> str:
if state == "recording": if state == "recording":
return "Recording" return "Recording"
if state == "transcribing": if state == "stt":
return "Transcribing" return "STT"
if state == "processing": if state == "processing":
return "AI Processing" return "AI Processing"
return "Idle" return "Idle"

View file

@@ -1,5 +1,5 @@
[Unit] [Unit]
Description=lel X11 transcription daemon Description=lel X11 STT daemon
After=default.target After=default.target
[Service] [Service]