Inline STT and tray

This commit is contained in:
Thales Maciel 2026-02-24 11:27:22 -03:00
parent 8c68719041
commit 4e8edc3e40
No known key found for this signature in database
GPG key ID: 33112E6833C34679
8 changed files with 109 additions and 171 deletions

View file

@ -4,7 +4,7 @@
- `src/leld.py` is the primary entrypoint (X11 STT daemon). - `src/leld.py` is the primary entrypoint (X11 STT daemon).
- `src/recorder.py` handles audio capture using PortAudio via `sounddevice`. - `src/recorder.py` handles audio capture using PortAudio via `sounddevice`.
- `src/stt.py` wraps faster-whisper for STT. - `src/leld.py` owns Whisper setup and transcription.
## Build, Test, and Development Commands ## Build, Test, and Development Commands

View file

@ -40,7 +40,6 @@ Create `~/.config/lel/config.json`:
"ai_cleanup": { "ai_cleanup": {
"model": "llama3.2:3b", "model": "llama3.2:3b",
"temperature": 0.0,
"base_url": "http://localhost:11434", "base_url": "http://localhost:11434",
"api_key": "" "api_key": ""
} }

View file

@ -5,21 +5,27 @@ import json
import logging import logging
import sys import sys
import urllib.request import urllib.request
from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from dataclasses import dataclass
def load_system_prompt(path: str | None) -> str: SYSTEM_PROMPT = (
if path: "You are an amanuensis. Rewrite the user's dictated text into clean, grammatical prose.\n\n"
return Path(path).read_text(encoding="utf-8").strip() "Rules:\n"
return (Path(__file__).parent / "system_prompt.txt").read_text(encoding="utf-8").strip() "- Remove filler words (um/uh/like), false starts, and self-corrections.\n"
"- Keep meaning, facts, and intent.\n"
"- Prefer concise sentences.\n"
"- Do not add new info.\n"
"- Output ONLY the cleaned text, no commentary.\n\n"
"Examples:\n"
" - \"schedule that for 5 PM, I mean 4 PM\" -> \"schedule that for 4 PM\"\n"
" - \"let's ask Bob, I mean Janice, let's ask Janice\" -> \"let's ask Janice\"\n"
)
@dataclass @dataclass
class AIConfig: class AIConfig:
model: str model: str
temperature: float
system_prompt_file: str
base_url: str base_url: str
api_key: str api_key: str
timeout_sec: int timeout_sec: int
@ -30,7 +36,7 @@ class AIConfig:
class GenericAPIProcessor: class GenericAPIProcessor:
def __init__(self, cfg: AIConfig): def __init__(self, cfg: AIConfig):
self.cfg = cfg self.cfg = cfg
self.system = load_system_prompt(cfg.system_prompt_file) self.system = SYSTEM_PROMPT
def process(self, text: str) -> str: def process(self, text: str) -> str:
language = self.cfg.language_hint or "" language = self.cfg.language_hint or ""
@ -46,7 +52,7 @@ class GenericAPIProcessor:
{"role": "system", "content": self.system}, {"role": "system", "content": self.system},
{"role": "user", "content": user_content}, {"role": "user", "content": user_content},
], ],
"temperature": self.cfg.temperature, "temperature": 0.0,
} }
data = json.dumps(payload).encode("utf-8") data = json.dumps(payload).encode("utf-8")
url = _chat_completions_url(self.cfg.base_url) url = _chat_completions_url(self.cfg.base_url)
@ -101,6 +107,10 @@ def list_models(base_url: str, api_key: str = "", timeout_sec: int = 10) -> list
return [] return []
# Compatibility shim for callers that still ask for the system prompt through a
# loader function: the optional `_path` argument is accepted but ignored, and
# the module-level SYSTEM_PROMPT constant is always returned.
def load_system_prompt(_path: str | None = None) -> str:
return SYSTEM_PROMPT
def _models_url(base_url: str) -> str: def _models_url(base_url: str) -> str:
root = _root_url(base_url) root = _root_url(base_url)
return root.rstrip("/") + "/v1/models" return root.rstrip("/") + "/v1/models"
@ -149,14 +159,11 @@ def main() -> int:
json.dumps(redacted_dict(cfg), indent=2), json.dumps(redacted_dict(cfg), indent=2),
) )
prompt = load_system_prompt("") logging.info("system prompt:\n%s", SYSTEM_PROMPT)
logging.info("system prompt:\n%s", prompt)
processor = build_processor( processor = build_processor(
AIConfig( AIConfig(
model=cfg.ai_cleanup.get("model", ""), model=cfg.ai_cleanup.get("model", ""),
temperature=cfg.ai_cleanup.get("temperature", 0.0),
system_prompt_file="",
base_url=cfg.ai_cleanup.get("base_url", ""), base_url=cfg.ai_cleanup.get("base_url", ""),
api_key=cfg.ai_cleanup.get("api_key", ""), api_key=cfg.ai_cleanup.get("api_key", ""),
timeout_sec=25, timeout_sec=25,

View file

@ -12,7 +12,6 @@ class Config:
ai_cleanup: dict = field( ai_cleanup: dict = field(
default_factory=lambda: { default_factory=lambda: {
"model": "llama3.2:3b", "model": "llama3.2:3b",
"temperature": 0.0,
"base_url": "http://localhost:11434", "base_url": "http://localhost:11434",
"api_key": "", "api_key": "",
} }
@ -30,7 +29,7 @@ def load(path: str | None) -> Config:
p = Path(path) if path else default_path() p = Path(path) if path else default_path()
if p.exists(): if p.exists():
data = json.loads(p.read_text(encoding="utf-8")) data = json.loads(p.read_text(encoding="utf-8"))
if any(k in data for k in ("daemon", "recording", "stt", "injection", "ai_cleanup", "ai")): if any(k in data for k in ("daemon", "recording", "stt", "injection", "ai_cleanup")):
for k, v in data.items(): for k, v in data.items():
if hasattr(cfg, k): if hasattr(cfg, k):
setattr(cfg, k, v) setattr(cfg, k, v)
@ -41,7 +40,6 @@ def load(path: str | None) -> Config:
cfg.stt["device"] = data.get("whisper_device", cfg.stt["device"]) cfg.stt["device"] = data.get("whisper_device", cfg.stt["device"])
cfg.injection["backend"] = data.get("injection_backend", cfg.injection["backend"]) cfg.injection["backend"] = data.get("injection_backend", cfg.injection["backend"])
cfg.ai_cleanup["model"] = data.get("ai_model", cfg.ai_cleanup["model"]) cfg.ai_cleanup["model"] = data.get("ai_model", cfg.ai_cleanup["model"])
cfg.ai_cleanup["temperature"] = data.get("ai_temperature", cfg.ai_cleanup["temperature"])
cfg.ai_cleanup["base_url"] = data.get("ai_base_url", cfg.ai_cleanup["base_url"]) cfg.ai_cleanup["base_url"] = data.get("ai_base_url", cfg.ai_cleanup["base_url"])
cfg.ai_cleanup["api_key"] = data.get("ai_api_key", cfg.ai_cleanup["api_key"]) cfg.ai_cleanup["api_key"] = data.get("ai_api_key", cfg.ai_cleanup["api_key"])
@ -56,20 +54,9 @@ def load(path: str | None) -> Config:
if not isinstance(cfg.ai_cleanup, dict): if not isinstance(cfg.ai_cleanup, dict):
cfg.ai_cleanup = { cfg.ai_cleanup = {
"model": "llama3.2:3b", "model": "llama3.2:3b",
"temperature": 0.0,
"base_url": "http://localhost:11434", "base_url": "http://localhost:11434",
"api_key": "", "api_key": "",
} }
legacy_ai = getattr(cfg, "ai", None)
if isinstance(legacy_ai, dict) and not cfg.ai_cleanup:
cfg.ai_cleanup = legacy_ai
try:
delattr(cfg, "ai")
except AttributeError:
pass
except Exception:
pass
validate(cfg) validate(cfg)
return cfg return cfg

View file

@ -9,13 +9,18 @@ import threading
import time import time
from pathlib import Path from pathlib import Path
import gi
from faster_whisper import WhisperModel
from config import Config, load, redacted_dict from config import Config, load, redacted_dict
from recorder import start_recording, stop_recording from recorder import start_recording, stop_recording
from stt import FasterWhisperSTT, STTConfig
from aiprocess import AIConfig, build_processor from aiprocess import AIConfig, build_processor
from inject import inject from inject import inject
from x11_hotkey import listen from x11_hotkey import listen
from tray import run_tray
gi.require_version("Gtk", "3.0")
from gi.repository import GLib, Gtk # type: ignore[import-not-found]
class State: class State:
@ -26,6 +31,13 @@ class State:
OUTPUTTING = "outputting" OUTPUTTING = "outputting"
def _compute_type(device: str) -> str:
dev = (device or "cpu").lower()
if dev == "cuda":
return "float16"
return "int8"
class Daemon: class Daemon:
def __init__(self, cfg: Config): def __init__(self, cfg: Config):
self.cfg = cfg self.cfg = cfg
@ -34,15 +46,12 @@ class Daemon:
self.proc = None self.proc = None
self.record = None self.record = None
self.timer = None self.timer = None
self.stt = FasterWhisperSTT( self.model = WhisperModel(
STTConfig( cfg.stt.get("model", "base"),
model=cfg.stt.get("model", "base"),
language=None,
device=cfg.stt.get("device", "cpu"), device=cfg.stt.get("device", "cpu"),
vad_filter=True, compute_type=_compute_type(cfg.stt.get("device", "cpu")),
) )
) self.tray = _Tray(self.get_state, self._quit)
self.ai = None
def set_state(self, state: str): def set_state(self, state: str):
with self.lock: with self.lock:
@ -55,6 +64,9 @@ class Daemon:
with self.lock: with self.lock:
return self.state return self.state
# Terminate the whole process immediately with exit status 0. This method is
# the quit callback handed to the tray; os._exit() ends the process without
# running interpreter cleanup (no atexit handlers, no stdio flushing).
def _quit(self):
os._exit(0)
def toggle(self): def toggle(self):
with self.lock: with self.lock:
if self.state == State.IDLE: if self.state == State.IDLE:
@ -118,7 +130,7 @@ class Daemon:
try: try:
self.set_state(State.STT) self.set_state(State.STT)
logging.info("stt started") logging.info("stt started")
text = self.stt.transcribe(record.wav_path, language="en") text = self._transcribe(record.wav_path)
except Exception as exc: except Exception as exc:
logging.error("stt failed: %s", exc) logging.error("stt failed: %s", exc)
self.set_state(State.IDLE) self.set_state(State.IDLE)
@ -131,7 +143,6 @@ class Daemon:
logging.info("stt: %s", text) logging.info("stt: %s", text)
ai_prompt_file = ""
ai_model = (self.cfg.ai_cleanup.get("model") or "").strip() ai_model = (self.cfg.ai_cleanup.get("model") or "").strip()
ai_base_url = (self.cfg.ai_cleanup.get("base_url") or "").strip() ai_base_url = (self.cfg.ai_cleanup.get("base_url") or "").strip()
if ai_model and ai_base_url: if ai_model and ai_base_url:
@ -141,8 +152,6 @@ class Daemon:
processor = build_processor( processor = build_processor(
AIConfig( AIConfig(
model=ai_model, model=ai_model,
temperature=self.cfg.ai_cleanup.get("temperature", 0.0),
system_prompt_file=ai_prompt_file,
base_url=ai_base_url, base_url=ai_base_url,
api_key=self.cfg.ai_cleanup.get("api_key", ""), api_key=self.cfg.ai_cleanup.get("api_key", ""),
timeout_sec=25, timeout_sec=25,
@ -174,6 +183,69 @@ class Daemon:
self.state = State.STT self.state = State.STT
threading.Thread(target=self._stop_and_process, daemon=True).start() threading.Thread(target=self._stop_and_process, daemon=True).start()
def _transcribe(self, wav_path: str) -> str:
segments, _info = self.model.transcribe(
wav_path,
language=None,
vad_filter=True,
)
parts = []
for seg in segments:
text = (seg.text or "").strip()
if text:
parts.append(text)
return " ".join(parts).strip()
# Start the tray UI by delegating to self.tray.run().
# NOTE(review): run() ends in Gtk.main(), so this call presumably blocks until
# the GTK main loop exits — confirm before calling it off the main thread.
def run_tray(self):
self.tray.run()
class _Tray:
    """GTK status icon that mirrors the daemon state and offers a Quit menu.

    ``state_getter`` is polled every 250 ms from the GTK main loop. The icon
    image and tooltip are only touched when the reported state differs from
    the one already shown, so the image file is not re-read from disk four
    times per second while nothing changes.
    """

    # Sentinel meaning "nothing has been rendered yet"; distinct from any state.
    _UNSET = object()
    _shown_state = _UNSET

    def __init__(self, state_getter, on_quit):
        """Create the status icon and its context menu.

        Args:
            state_getter: Zero-argument callable returning the current state.
            on_quit: Zero-argument callable invoked by the "Quit" menu item.
        """
        self.state_getter = state_getter
        self.on_quit = on_quit
        self.base = Path(__file__).parent / "assets"
        self.icon = Gtk.StatusIcon()
        self.icon.set_visible(True)
        self.icon.connect("popup-menu", self._on_menu)
        self.menu = Gtk.Menu()
        quit_item = Gtk.MenuItem(label="Quit")
        quit_item.connect("activate", lambda *_: self.on_quit())
        self.menu.append(quit_item)
        self.menu.show_all()

    def _on_menu(self, _icon, _button, _time):
        """Handle the icon's ``popup-menu`` signal by showing the menu.

        The button and timestamp delivered with the signal are forwarded to
        ``Gtk.Menu.popup`` and the menu is aligned to the status icon, instead
        of hard-coding button 0 with no positioning function.
        """
        self.menu.popup(
            None,
            None,
            Gtk.StatusIcon.position_menu,
            _icon,
            _button,
            _time,
        )

    def _icon_path(self, state: str) -> str:
        """Return the icon file path for ``state`` (idle icon by default)."""
        if state == State.RECORDING:
            filename = "recording.png"
        elif state == State.STT:
            filename = "transcribing.png"
        elif state == State.PROCESSING:
            filename = "processing.png"
        else:
            filename = "idle.png"
        return str(self.base / filename)

    def _title(self, state: str) -> str:
        """Return the tooltip text for ``state`` ("Idle" by default)."""
        if state == State.RECORDING:
            return "Recording"
        if state == State.STT:
            return "STT"
        if state == State.PROCESSING:
            return "AI Processing"
        return "Idle"

    def update(self):
        """Refresh icon and tooltip if the state changed.

        Always returns ``True`` so that, when used as a GLib timeout callback,
        the timeout stays scheduled.
        """
        state = self.state_getter()
        if state != self._shown_state:
            self.icon.set_from_file(self._icon_path(state))
            self.icon.set_tooltip_text(self._title(state))
            self._shown_state = state
        return True

    def run(self):
        """Render the initial state, schedule polling and run the GTK loop."""
        self.update()
        GLib.timeout_add(250, self.update)
        Gtk.main()
def _lock_single_instance(): def _lock_single_instance():
runtime_dir = Path(os.getenv("XDG_RUNTIME_DIR", "/tmp")) / "lel" runtime_dir = Path(os.getenv("XDG_RUNTIME_DIR", "/tmp")) / "lel"
@ -199,8 +271,6 @@ def main():
logging.basicConfig(stream=sys.stderr, level=logging.INFO, format="leld: %(asctime)s %(message)s") logging.basicConfig(stream=sys.stderr, level=logging.INFO, format="leld: %(asctime)s %(message)s")
cfg = load(args.config) cfg = load(args.config)
config_path = Path(args.config) if args.config else Path.home() / ".config" / "lel" / "config.json"
_lock_single_instance() _lock_single_instance()
logging.info("ready (hotkey: %s)", cfg.daemon.get("hotkey", "")) logging.info("ready (hotkey: %s)", cfg.daemon.get("hotkey", ""))
@ -208,9 +278,6 @@ def main():
daemon = Daemon(cfg) daemon = Daemon(cfg)
def on_quit():
os._exit(0)
def handle_signal(_sig, _frame): def handle_signal(_sig, _frame):
logging.info("signal received, shutting down") logging.info("signal received, shutting down")
daemon.stop_recording() daemon.stop_recording()
@ -236,7 +303,7 @@ def main():
), ),
daemon=True, daemon=True,
).start() ).start()
run_tray(daemon.get_state, on_quit, None) daemon.run_tray()
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -1,48 +0,0 @@
from __future__ import annotations
from dataclasses import dataclass
from faster_whisper import WhisperModel
# Settings consumed by FasterWhisperSTT when it builds and runs the model.
@dataclass
class STTConfig:
model: str  # passed as the first positional argument to WhisperModel
language: str | None  # fallback language when transcribe() is called without one
device: str  # passed to WhisperModel as device; a falsy value falls back to "cpu"
vad_filter: bool  # forwarded as vad_filter to the model's transcribe() call
def _compute_type(device: str) -> str:
dev = (device or "cpu").lower()
if dev == "cuda":
return "float16"
return "int8"
class FasterWhisperSTT:
    """Speech-to-text wrapper that builds its ``WhisperModel`` on first use."""

    def __init__(self, cfg: STTConfig):
        """Store ``cfg``; the model itself is created lazily by ``_load``."""
        self._model: WhisperModel | None = None
        self.cfg = cfg

    def _load(self):
        """Create the model from ``self.cfg`` unless one already exists."""
        if self._model is not None:
            return
        device = self.cfg.device
        self._model = WhisperModel(
            self.cfg.model,
            device=device or "cpu",
            compute_type=_compute_type(device),
        )

    def transcribe(self, wav_path: str, language: str | None = None) -> str:
        """Transcribe ``wav_path`` and return the segments joined by spaces.

        ``language`` overrides the configured language when truthy. Segment
        texts are stripped and empty segments are skipped.
        """
        self._load()
        segments, _ = self._model.transcribe(  # type: ignore[union-attr]
            wav_path,
            language=language or self.cfg.language,
            vad_filter=self.cfg.vad_filter,
        )
        stripped = ((segment.text or "").strip() for segment in segments)
        return " ".join(piece for piece in stripped if piece).strip()

View file

@ -1,12 +0,0 @@
You are an amanuensis. Rewrite the user's dictated text into clean, grammatical prose.
Rules:
- Remove filler words (um/uh/like), false starts, and self-corrections.
- Keep meaning, facts, and intent.
- Prefer concise sentences.
- Do not add new info.
- Output ONLY the cleaned text, no commentary.
Examples:
- "schedule that for 5 PM, I mean 4 PM" -> "schedule that for 4 PM"
- "let's ask Bob, I mean Janice, let's ask Janice" -> "let's ask Janice"

View file

@ -1,62 +0,0 @@
from __future__ import annotations
import gi
gi.require_version("Gtk", "3.0")
from gi.repository import GLib, Gtk
from pathlib import Path
class Tray:
    """GTK status icon that mirrors the daemon state and offers a Quit menu.

    ``update`` is meant to be polled from the GTK main loop. It only touches
    the icon image and tooltip when the reported state differs from the one
    already shown, so the image file is not re-read from disk on every poll.
    """

    # Icon file name and tooltip per known state; other states use the idle look.
    _ICON_FILES = {
        "recording": "recording.png",
        "stt": "transcribing.png",
        "processing": "processing.png",
    }
    _TITLES = {
        "recording": "Recording",
        "stt": "STT",
        "processing": "AI Processing",
    }

    # Sentinel meaning "nothing has been rendered yet"; distinct from any state.
    _UNSET = object()
    _shown_state = _UNSET

    def __init__(self, state_getter, on_quit):
        """Create the status icon and its context menu.

        Args:
            state_getter: Zero-argument callable returning the current state.
            on_quit: Zero-argument callable invoked by the "Quit" menu item.
        """
        self.state_getter = state_getter
        self.on_quit = on_quit
        self.base = Path(__file__).parent / "assets"
        self.icon = Gtk.StatusIcon()
        self.icon.set_visible(True)
        self.icon.connect("popup-menu", self._on_menu)
        self.menu = Gtk.Menu()
        quit_item = Gtk.MenuItem(label="Quit")
        quit_item.connect("activate", lambda *_: self.on_quit())
        self.menu.append(quit_item)
        self.menu.show_all()

    def _on_menu(self, _icon, _button, _time):
        """Handle the icon's ``popup-menu`` signal by showing the menu.

        The button and timestamp delivered with the signal are forwarded to
        ``Gtk.Menu.popup`` and the menu is aligned to the status icon, instead
        of hard-coding button 0 with no positioning function.
        """
        self.menu.popup(
            None,
            None,
            Gtk.StatusIcon.position_menu,
            _icon,
            _button,
            _time,
        )

    def _icon_path(self, state: str) -> str:
        """Return the icon file path for ``state`` (idle icon by default)."""
        return str(self.base / self._ICON_FILES.get(state, "idle.png"))

    def _title(self, state: str) -> str:
        """Return the tooltip text for ``state`` ("Idle" by default)."""
        return self._TITLES.get(state, "Idle")

    def update(self):
        """Refresh icon and tooltip if the state changed.

        Always returns ``True`` so that, when used as a GLib timeout callback,
        the timeout stays scheduled.
        """
        state = self.state_getter()
        if state != self._shown_state:
            self.icon.set_from_file(self._icon_path(state))
            self.icon.set_tooltip_text(self._title(state))
            self._shown_state = state
        return True
def run_tray(state_getter, on_quit, on_settings):
    """Build the tray icon, start polling the state and run the GTK main loop.

    Args:
        state_getter: Zero-argument callable returning the current state.
        on_quit: Zero-argument callable invoked by the "Quit" menu item.
        on_settings: Optional zero-argument callable; when truthy, a
            "Settings" item that invokes it is placed at the top of the menu.
    """
    indicator = Tray(state_getter, on_quit)
    indicator.update()
    # Poll the state every 250 ms from the GTK main loop.
    GLib.timeout_add(250, indicator.update)

    if on_settings:

        def _open_settings(*_args):
            return on_settings()

        entry = Gtk.MenuItem(label="Settings")
        entry.connect("activate", _open_settings)
        indicator.menu.prepend(entry)
        indicator.menu.show_all()

    Gtk.main()