From bb7780c461defe1af741d164ebd8e3b22ae3949e Mon Sep 17 00:00:00 2001 From: Thales Maciel Date: Sat, 7 Feb 2026 15:21:33 -0300 Subject: [PATCH] Switch to faster-whisper --- README.md | 6 +++--- requirements.txt | 2 +- src/leld.py | 11 ++++++++-- src/stt.py | 53 ++++++++++++++++++++++++++++++++++-------------- 4 files changed, 51 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 070b541..f684645 100644 --- a/README.md +++ b/README.md @@ -6,18 +6,18 @@ Python X11 transcription daemon that records audio, runs Whisper, logs the trans - X11 (not Wayland) - `ffmpeg` -- `whisper` (OpenAI Whisper CLI) +- `faster-whisper` - `xclip` - `xdotool` - Tray icon deps: `libappindicator3` and `gtk3` (required by `systray`) -- Python deps: `pystray`, `pillow`, `python-xlib`, `ollama`, `openai-whisper` +- Python deps: `pystray`, `pillow`, `python-xlib`, `ollama`, `faster-whisper` ## Python Daemon Install Python deps: ```bash -pip install -r src/requirements.txt +pip install -r requirements.txt ``` Run: diff --git a/requirements.txt b/requirements.txt index 848f983..6991611 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -openai-whisper +faster-whisper ollama pystray pillow diff --git a/src/leld.py b/src/leld.py index e07da8f..bb8fa34 100755 --- a/src/leld.py +++ b/src/leld.py @@ -11,7 +11,7 @@ from pathlib import Path from config import Config, load, redacted_dict from recorder import start_recording, stop_recording -from stt import WhisperSTT +from stt import FasterWhisperSTT, STTConfig from aiprocess import AIConfig, build_processor from inject import inject from x11_hotkey import listen @@ -34,7 +34,14 @@ class Daemon: self.proc = None self.record = None self.timer = None - self.stt = WhisperSTT(cfg.whisper_model, cfg.whisper_lang, cfg.whisper_device) + self.stt = FasterWhisperSTT( + STTConfig( + model=cfg.whisper_model, + language=cfg.whisper_lang, + device=cfg.whisper_device, + vad_filter=True, + ) + ) self.ai = None if cfg.ai_enabled: self.ai = build_processor( diff --git a/src/stt.py b/src/stt.py index 638dc97..576f44c 100644 --- a/src/stt.py +++ b/src/stt.py @@ -1,25 +1,48 @@ -import os -import whisper +from __future__ import annotations + +from dataclasses import dataclass + +from faster_whisper import WhisperModel -def _force_cpu(): - os.environ.setdefault("CUDA_VISIBLE_DEVICES", "") +@dataclass +class STTConfig: + model: str + language: str | None + device: str + vad_filter: bool -class WhisperSTT: - def __init__(self, model: str, language: str | None = None, device: str = "cpu"): - self.model_name = model - self.language = language - self.device = (device or "cpu").lower() - self._model = None +def _compute_type(device: str) -> str: + dev = (device or "cpu").lower() + if dev == "cuda": + return "float16" + return "int8" + + +class FasterWhisperSTT: + def __init__(self, cfg: STTConfig): + self.cfg = cfg + self._model: WhisperModel | None = None def _load(self): if self._model is None: - if self.device == "cpu": - _force_cpu() - self._model = whisper.load_model(self.model_name, device=self.device) + self._model = WhisperModel( + self.cfg.model, + device=self.cfg.device or "cpu", + compute_type=_compute_type(self.cfg.device), + ) def transcribe(self, wav_path: str) -> str: self._load() - result = self._model.transcribe(wav_path, language=self.language) - return (result.get("text") or "").strip() + segments, _info = self._model.transcribe( + wav_path, + language=self.cfg.language, + vad_filter=self.cfg.vad_filter, + ) + parts = [] + for seg in segments: + text = (seg.text or "").strip() + if text: + parts.append(text) + return " ".join(parts).strip()