Inline STT and tray
This commit is contained in:
parent
8c68719041
commit
4e8edc3e40
8 changed files with 109 additions and 171 deletions
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
- `src/leld.py` is the primary entrypoint (X11 STT daemon).
|
||||
- `src/recorder.py` handles audio capture using PortAudio via `sounddevice`.
|
||||
- `src/stt.py` wraps faster-whisper for STT.
|
||||
- `src/leld.py` owns Whisper setup and transcription.
|
||||
|
||||
## Build, Test, and Development Commands
|
||||
|
||||
|
|
|
|||
|
|
@ -40,7 +40,6 @@ Create `~/.config/lel/config.json`:
|
|||
|
||||
"ai_cleanup": {
|
||||
"model": "llama3.2:3b",
|
||||
"temperature": 0.0,
|
||||
"base_url": "http://localhost:11434",
|
||||
"api_key": ""
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,21 +5,27 @@ import json
|
|||
import logging
|
||||
import sys
|
||||
import urllib.request
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
def load_system_prompt(path: str | None) -> str:
    """Return the system prompt text with surrounding whitespace stripped.

    Reads the file at ``path`` when a non-empty path is given; otherwise
    reads ``system_prompt.txt`` from the directory containing this module.
    Both files are decoded as UTF-8.
    """
    bundled_file = Path(__file__).parent / "system_prompt.txt"
    prompt_file = Path(path) if path else bundled_file
    return prompt_file.read_text(encoding="utf-8").strip()
|
||||
# Built-in system prompt for the dictation clean-up step. The text is split
# one source line per prompt line so edits show up cleanly in diffs.
SYSTEM_PROMPT = "".join(
    (
        "You are an amanuensis. Rewrite the user's dictated text into clean, grammatical prose.\n\n",
        "Rules:\n",
        "- Remove filler words (um/uh/like), false starts, and self-corrections.\n",
        "- Keep meaning, facts, and intent.\n",
        "- Prefer concise sentences.\n",
        "- Do not add new info.\n",
        "- Output ONLY the cleaned text, no commentary.\n\n",
        "Examples:\n",
        " - \"schedule that for 5 PM, I mean 4 PM\" -> \"schedule that for 4 PM\"\n",
        " - \"let's ask Bob, I mean Janice, let's ask Janice\" -> \"let's ask Janice\"\n",
    )
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class AIConfig:
|
||||
model: str
|
||||
temperature: float
|
||||
system_prompt_file: str
|
||||
base_url: str
|
||||
api_key: str
|
||||
timeout_sec: int
|
||||
|
|
@ -30,7 +36,7 @@ class AIConfig:
|
|||
class GenericAPIProcessor:
|
||||
def __init__(self, cfg: AIConfig):
|
||||
self.cfg = cfg
|
||||
self.system = load_system_prompt(cfg.system_prompt_file)
|
||||
self.system = SYSTEM_PROMPT
|
||||
|
||||
def process(self, text: str) -> str:
|
||||
language = self.cfg.language_hint or ""
|
||||
|
|
@ -46,7 +52,7 @@ class GenericAPIProcessor:
|
|||
{"role": "system", "content": self.system},
|
||||
{"role": "user", "content": user_content},
|
||||
],
|
||||
"temperature": self.cfg.temperature,
|
||||
"temperature": 0.0,
|
||||
}
|
||||
data = json.dumps(payload).encode("utf-8")
|
||||
url = _chat_completions_url(self.cfg.base_url)
|
||||
|
|
@ -101,6 +107,10 @@ def list_models(base_url: str, api_key: str = "", timeout_sec: int = 10) -> list
|
|||
return []
|
||||
|
||||
|
||||
def load_system_prompt(_path: str | None = None) -> str:
    """Return the built-in ``SYSTEM_PROMPT``.

    ``_path`` is accepted but ignored; no file is read.
    """
    return SYSTEM_PROMPT
|
||||
|
||||
|
||||
def _models_url(base_url: str) -> str:
    """Return the ``/v1/models`` URL under the root derived from ``base_url``.

    Trailing slashes on the root are dropped before the path is appended.
    """
    return _root_url(base_url).rstrip("/") + "/v1/models"
|
||||
|
|
@ -149,14 +159,11 @@ def main() -> int:
|
|||
json.dumps(redacted_dict(cfg), indent=2),
|
||||
)
|
||||
|
||||
prompt = load_system_prompt("")
|
||||
logging.info("system prompt:\n%s", prompt)
|
||||
logging.info("system prompt:\n%s", SYSTEM_PROMPT)
|
||||
|
||||
processor = build_processor(
|
||||
AIConfig(
|
||||
model=cfg.ai_cleanup.get("model", ""),
|
||||
temperature=cfg.ai_cleanup.get("temperature", 0.0),
|
||||
system_prompt_file="",
|
||||
base_url=cfg.ai_cleanup.get("base_url", ""),
|
||||
api_key=cfg.ai_cleanup.get("api_key", ""),
|
||||
timeout_sec=25,
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@ class Config:
|
|||
ai_cleanup: dict = field(
|
||||
default_factory=lambda: {
|
||||
"model": "llama3.2:3b",
|
||||
"temperature": 0.0,
|
||||
"base_url": "http://localhost:11434",
|
||||
"api_key": "",
|
||||
}
|
||||
|
|
@ -30,7 +29,7 @@ def load(path: str | None) -> Config:
|
|||
p = Path(path) if path else default_path()
|
||||
if p.exists():
|
||||
data = json.loads(p.read_text(encoding="utf-8"))
|
||||
if any(k in data for k in ("daemon", "recording", "stt", "injection", "ai_cleanup", "ai")):
|
||||
if any(k in data for k in ("daemon", "recording", "stt", "injection", "ai_cleanup")):
|
||||
for k, v in data.items():
|
||||
if hasattr(cfg, k):
|
||||
setattr(cfg, k, v)
|
||||
|
|
@ -41,7 +40,6 @@ def load(path: str | None) -> Config:
|
|||
cfg.stt["device"] = data.get("whisper_device", cfg.stt["device"])
|
||||
cfg.injection["backend"] = data.get("injection_backend", cfg.injection["backend"])
|
||||
cfg.ai_cleanup["model"] = data.get("ai_model", cfg.ai_cleanup["model"])
|
||||
cfg.ai_cleanup["temperature"] = data.get("ai_temperature", cfg.ai_cleanup["temperature"])
|
||||
cfg.ai_cleanup["base_url"] = data.get("ai_base_url", cfg.ai_cleanup["base_url"])
|
||||
cfg.ai_cleanup["api_key"] = data.get("ai_api_key", cfg.ai_cleanup["api_key"])
|
||||
|
||||
|
|
@ -56,20 +54,9 @@ def load(path: str | None) -> Config:
|
|||
if not isinstance(cfg.ai_cleanup, dict):
|
||||
cfg.ai_cleanup = {
|
||||
"model": "llama3.2:3b",
|
||||
"temperature": 0.0,
|
||||
"base_url": "http://localhost:11434",
|
||||
"api_key": "",
|
||||
}
|
||||
legacy_ai = getattr(cfg, "ai", None)
|
||||
if isinstance(legacy_ai, dict) and not cfg.ai_cleanup:
|
||||
cfg.ai_cleanup = legacy_ai
|
||||
try:
|
||||
delattr(cfg, "ai")
|
||||
except AttributeError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
validate(cfg)
|
||||
return cfg
|
||||
|
||||
|
|
|
|||
107
src/leld.py
107
src/leld.py
|
|
@ -9,13 +9,18 @@ import threading
|
|||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import gi
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
from config import Config, load, redacted_dict
|
||||
from recorder import start_recording, stop_recording
|
||||
from stt import FasterWhisperSTT, STTConfig
|
||||
from aiprocess import AIConfig, build_processor
|
||||
from inject import inject
|
||||
from x11_hotkey import listen
|
||||
from tray import run_tray
|
||||
|
||||
gi.require_version("Gtk", "3.0")
|
||||
|
||||
from gi.repository import GLib, Gtk # type: ignore[import-not-found]
|
||||
|
||||
|
||||
class State:
|
||||
|
|
@ -26,6 +31,13 @@ class State:
|
|||
OUTPUTTING = "outputting"
|
||||
|
||||
|
||||
def _compute_type(device: str) -> str:
|
||||
dev = (device or "cpu").lower()
|
||||
if dev == "cuda":
|
||||
return "float16"
|
||||
return "int8"
|
||||
|
||||
|
||||
class Daemon:
|
||||
def __init__(self, cfg: Config):
|
||||
self.cfg = cfg
|
||||
|
|
@ -34,15 +46,12 @@ class Daemon:
|
|||
self.proc = None
|
||||
self.record = None
|
||||
self.timer = None
|
||||
self.stt = FasterWhisperSTT(
|
||||
STTConfig(
|
||||
model=cfg.stt.get("model", "base"),
|
||||
language=None,
|
||||
device=cfg.stt.get("device", "cpu"),
|
||||
vad_filter=True,
|
||||
)
|
||||
self.model = WhisperModel(
|
||||
cfg.stt.get("model", "base"),
|
||||
device=cfg.stt.get("device", "cpu"),
|
||||
compute_type=_compute_type(cfg.stt.get("device", "cpu")),
|
||||
)
|
||||
self.ai = None
|
||||
self.tray = _Tray(self.get_state, self._quit)
|
||||
|
||||
def set_state(self, state: str):
|
||||
with self.lock:
|
||||
|
|
@ -55,6 +64,9 @@ class Daemon:
|
|||
with self.lock:
|
||||
return self.state
|
||||
|
||||
def _quit(self):
    """Terminate the process immediately with exit status 0."""
    os._exit(0)
|
||||
|
||||
def toggle(self):
|
||||
with self.lock:
|
||||
if self.state == State.IDLE:
|
||||
|
|
@ -118,7 +130,7 @@ class Daemon:
|
|||
try:
|
||||
self.set_state(State.STT)
|
||||
logging.info("stt started")
|
||||
text = self.stt.transcribe(record.wav_path, language="en")
|
||||
text = self._transcribe(record.wav_path)
|
||||
except Exception as exc:
|
||||
logging.error("stt failed: %s", exc)
|
||||
self.set_state(State.IDLE)
|
||||
|
|
@ -131,7 +143,6 @@ class Daemon:
|
|||
|
||||
logging.info("stt: %s", text)
|
||||
|
||||
ai_prompt_file = ""
|
||||
ai_model = (self.cfg.ai_cleanup.get("model") or "").strip()
|
||||
ai_base_url = (self.cfg.ai_cleanup.get("base_url") or "").strip()
|
||||
if ai_model and ai_base_url:
|
||||
|
|
@ -141,8 +152,6 @@ class Daemon:
|
|||
processor = build_processor(
|
||||
AIConfig(
|
||||
model=ai_model,
|
||||
temperature=self.cfg.ai_cleanup.get("temperature", 0.0),
|
||||
system_prompt_file=ai_prompt_file,
|
||||
base_url=ai_base_url,
|
||||
api_key=self.cfg.ai_cleanup.get("api_key", ""),
|
||||
timeout_sec=25,
|
||||
|
|
@ -174,6 +183,69 @@ class Daemon:
|
|||
self.state = State.STT
|
||||
threading.Thread(target=self._stop_and_process, daemon=True).start()
|
||||
|
||||
def _transcribe(self, wav_path: str) -> str:
|
||||
segments, _info = self.model.transcribe(
|
||||
wav_path,
|
||||
language=None,
|
||||
vad_filter=True,
|
||||
)
|
||||
parts = []
|
||||
for seg in segments:
|
||||
text = (seg.text or "").strip()
|
||||
if text:
|
||||
parts.append(text)
|
||||
return " ".join(parts).strip()
|
||||
|
||||
def run_tray(self):
|
||||
self.tray.run()
|
||||
|
||||
|
||||
class _Tray:
    """GTK status icon that mirrors the daemon state and offers a Quit menu.

    ``state_getter`` is called with no arguments and returns the current
    state; ``on_quit`` is called with no arguments when the "Quit" menu item
    is activated. Icons are loaded from the ``assets`` directory next to this
    module.
    """

    # State most recently pushed to the icon; ``None`` until the first update.
    _shown_state = None

    def __init__(self, state_getter, on_quit):
        self.state_getter = state_getter
        self.on_quit = on_quit
        self.base = Path(__file__).parent / "assets"
        self.icon = Gtk.StatusIcon()
        self.icon.set_visible(True)
        self.icon.connect("popup-menu", self._on_menu)
        self.menu = Gtk.Menu()
        quit_item = Gtk.MenuItem(label="Quit")
        quit_item.connect("activate", lambda *_: self.on_quit())
        self.menu.append(quit_item)
        self.menu.show_all()

    def _on_menu(self, _icon, _button, _time):
        """Show the context menu in response to the icon's "popup-menu" signal."""
        # Forward the button and timestamp delivered with the signal instead
        # of a hard-coded button 0, as the GtkStatusIcon docs prescribe.
        self.menu.popup(None, None, None, None, _button, _time)

    def _icon_path(self, state: str) -> str:
        """Return the icon file path for ``state`` (idle icon as fallback)."""
        if state == State.RECORDING:
            return str(self.base / "recording.png")
        if state == State.STT:
            return str(self.base / "transcribing.png")
        if state == State.PROCESSING:
            return str(self.base / "processing.png")
        return str(self.base / "idle.png")

    def _title(self, state: str) -> str:
        """Return the tooltip text for ``state`` ("Idle" as fallback)."""
        if state == State.RECORDING:
            return "Recording"
        if state == State.STT:
            return "STT"
        if state == State.PROCESSING:
            return "AI Processing"
        return "Idle"

    def update(self):
        """Refresh icon and tooltip when the state changed; always return True.

        Returning True keeps the GLib timeout that calls this method active.
        """
        state = self.state_getter()
        # Polled every 250 ms: skip reloading the image file from disk when
        # the state is the one already displayed.
        if state != self._shown_state:
            self.icon.set_from_file(self._icon_path(state))
            self.icon.set_tooltip_text(self._title(state))
            self._shown_state = state
        return True

    def run(self):
        """Draw the icon once, schedule 250 ms refreshes and enter Gtk.main()."""
        self.update()
        GLib.timeout_add(250, self.update)
        Gtk.main()
|
||||
|
||||
|
||||
def _lock_single_instance():
|
||||
runtime_dir = Path(os.getenv("XDG_RUNTIME_DIR", "/tmp")) / "lel"
|
||||
|
|
@ -199,8 +271,6 @@ def main():
|
|||
|
||||
logging.basicConfig(stream=sys.stderr, level=logging.INFO, format="leld: %(asctime)s %(message)s")
|
||||
cfg = load(args.config)
|
||||
config_path = Path(args.config) if args.config else Path.home() / ".config" / "lel" / "config.json"
|
||||
|
||||
_lock_single_instance()
|
||||
|
||||
logging.info("ready (hotkey: %s)", cfg.daemon.get("hotkey", ""))
|
||||
|
|
@ -208,9 +278,6 @@ def main():
|
|||
|
||||
daemon = Daemon(cfg)
|
||||
|
||||
def on_quit():
|
||||
os._exit(0)
|
||||
|
||||
def handle_signal(_sig, _frame):
|
||||
logging.info("signal received, shutting down")
|
||||
daemon.stop_recording()
|
||||
|
|
@ -236,7 +303,7 @@ def main():
|
|||
),
|
||||
daemon=True,
|
||||
).start()
|
||||
run_tray(daemon.get_state, on_quit, None)
|
||||
daemon.run_tray()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
48
src/stt.py
48
src/stt.py
|
|
@ -1,48 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
|
||||
@dataclass
class STTConfig:
    """Settings read by ``FasterWhisperSTT``."""

    # Model identifier passed as the first argument to WhisperModel.
    model: str
    # Language used when transcribe() is not given one; None is passed through.
    language: str | None
    # Device passed to WhisperModel; a falsy value falls back to "cpu".
    device: str
    # Forwarded as ``vad_filter`` to the model's transcribe call.
    vad_filter: bool
|
||||
|
||||
|
||||
def _compute_type(device: str) -> str:
|
||||
dev = (device or "cpu").lower()
|
||||
if dev == "cuda":
|
||||
return "float16"
|
||||
return "int8"
|
||||
|
||||
|
||||
class FasterWhisperSTT:
    """Speech-to-text wrapper that creates its ``WhisperModel`` on first use."""

    def __init__(self, cfg: STTConfig):
        self.cfg = cfg
        # Created by _load() the first time transcribe() runs.
        self._model: WhisperModel | None = None

    def _load(self):
        """Create the model if it does not exist yet; otherwise do nothing."""
        if self._model is not None:
            return
        self._model = WhisperModel(
            self.cfg.model,
            device=self.cfg.device or "cpu",
            compute_type=_compute_type(self.cfg.device),
        )

    def transcribe(self, wav_path: str, language: str | None = None) -> str:
        """Transcribe ``wav_path`` and return the text as a single string.

        ``language`` falls back to ``cfg.language`` when falsy. Segment texts
        are stripped, empty ones are dropped, and the rest are joined with
        single spaces.
        """
        self._load()
        segments, _ = self._model.transcribe(  # type: ignore[union-attr]
            wav_path,
            language=language or self.cfg.language,
            vad_filter=self.cfg.vad_filter,
        )
        stripped = ((segment.text or "").strip() for segment in segments)
        return " ".join(piece for piece in stripped if piece).strip()
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
You are an amanuensis. Rewrite the user's dictated text into clean, grammatical prose.
|
||||
|
||||
Rules:
|
||||
- Remove filler words (um/uh/like), false starts, and self-corrections.
|
||||
- Keep meaning, facts, and intent.
|
||||
- Prefer concise sentences.
|
||||
- Do not add new info.
|
||||
- Output ONLY the cleaned text, no commentary.
|
||||
|
||||
Examples:
|
||||
- "schedule that for 5 PM, I mean 4 PM" -> "schedule that for 4 PM"
|
||||
- "let's ask Bob, I mean Janice, let's ask Janice" -> "let's ask Janice"
|
||||
62
src/tray.py
62
src/tray.py
|
|
@ -1,62 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import gi
|
||||
|
||||
gi.require_version("Gtk", "3.0")
|
||||
|
||||
from gi.repository import GLib, Gtk
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class Tray:
    """GTK status icon that mirrors a state string and offers a Quit menu.

    ``state_getter`` is called with no arguments and returns the current
    state string; ``on_quit`` is called with no arguments when the "Quit"
    menu item is activated. Icons are loaded from the ``assets`` directory
    next to this module.
    """

    # Icon file name and tooltip per known state; other states show as idle.
    _ICONS = {
        "recording": "recording.png",
        "stt": "transcribing.png",
        "processing": "processing.png",
    }
    _TITLES = {
        "recording": "Recording",
        "stt": "STT",
        "processing": "AI Processing",
    }

    # State most recently pushed to the icon; ``None`` until the first update.
    _shown_state = None

    def __init__(self, state_getter, on_quit):
        self.state_getter = state_getter
        self.on_quit = on_quit
        self.base = Path(__file__).parent / "assets"
        self.icon = Gtk.StatusIcon()
        self.icon.set_visible(True)
        self.icon.connect("popup-menu", self._on_menu)
        self.menu = Gtk.Menu()
        quit_item = Gtk.MenuItem(label="Quit")
        quit_item.connect("activate", lambda *_: self.on_quit())
        self.menu.append(quit_item)
        self.menu.show_all()

    def _on_menu(self, _icon, _button, _time):
        """Show the context menu in response to the icon's "popup-menu" signal."""
        # Forward the button and timestamp delivered with the signal instead
        # of a hard-coded button 0, as the GtkStatusIcon docs prescribe.
        self.menu.popup(None, None, None, None, _button, _time)

    def _icon_path(self, state: str) -> str:
        """Return the icon file path for ``state`` (idle icon as fallback)."""
        return str(self.base / self._ICONS.get(state, "idle.png"))

    def _title(self, state: str) -> str:
        """Return the tooltip text for ``state`` ("Idle" as fallback)."""
        return self._TITLES.get(state, "Idle")

    def update(self):
        """Refresh icon and tooltip when the state changed; always return True.

        Returning True keeps a GLib timeout that calls this method active.
        """
        state = self.state_getter()
        # Intended for frequent polling: skip reloading the image file from
        # disk when the state is the one already displayed.
        if state != self._shown_state:
            self.icon.set_from_file(self._icon_path(state))
            self.icon.set_tooltip_text(self._title(state))
            self._shown_state = state
        return True
|
||||
|
||||
|
||||
def run_tray(state_getter, on_quit, on_settings):
    """Create the tray icon, start its refresh timer and enter ``Gtk.main()``.

    The icon is drawn once immediately and then refreshed every 250 ms. When
    ``on_settings`` is truthy, a "Settings" item that calls it is placed at
    the top of the tray menu.
    """
    tray = Tray(state_getter, on_quit)
    tray.update()
    GLib.timeout_add(250, tray.update)
    if on_settings:
        item = Gtk.MenuItem(label="Settings")
        item.connect("activate", lambda *_args: on_settings())
        menu = tray.menu
        menu.prepend(item)
        menu.show_all()
    Gtk.main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue