From ebba452268b80e692f8cd106e392ab874543b81b Mon Sep 17 00:00:00 2001 From: Thales Maciel Date: Tue, 24 Feb 2026 11:48:02 -0300 Subject: [PATCH] Use in-memory audio for STT --- README.md | 5 ++--- pyproject.toml | 1 - src/leld.py | 12 ++++++------ src/recorder.py | 27 +++++++++------------------ uv.lock | 22 ---------------------- 5 files changed, 17 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index 3a69e0f..69f5613 100644 --- a/README.md +++ b/README.md @@ -6,12 +6,11 @@ Python X11 STT daemon that records audio, runs Whisper, logs the transcript, and - X11 (not Wayland) - `sounddevice` (PortAudio) -- `soundfile` (libsndfile) - `faster-whisper` - Tray icon deps: `gtk3` -- Python deps: `pillow`, `python-xlib`, `faster-whisper`, `PyGObject`, `sounddevice`, `soundfile` +- Python deps: `pillow`, `python-xlib`, `faster-whisper`, `PyGObject`, `sounddevice` -System packages (example names): `portaudio`/`libportaudio2` and `libsndfile`. +System packages (example names): `portaudio`/`libportaudio2`. ## Python Daemon diff --git a/pyproject.toml b/pyproject.toml index 546c6f0..90af824 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,6 @@ dependencies = [ "python-xlib", "PyGObject", "sounddevice", - "soundfile", ] [tool.uv] diff --git a/src/leld.py b/src/leld.py index 8d25430..d08ae71 100755 --- a/src/leld.py +++ b/src/leld.py @@ -103,7 +103,7 @@ class Daemon: self.proc = proc self.record = record self.state = State.RECORDING - logging.info("recording started (%s)", record.wav_path) + logging.info("recording started") if self.timer: self.timer.cancel() self.timer = threading.Timer(RECORD_TIMEOUT_SEC, self._timeout_stop) @@ -132,13 +132,13 @@ class Daemon: logging.info("stopping recording (user)") try: - stop_recording(proc, record) + audio = stop_recording(proc, record) except Exception as exc: logging.error("record stop failed: %s", exc) self.set_state(State.IDLE) return - if not Path(record.wav_path).exists(): + if audio.size == 0: logging.error("no audio captured") self.set_state(State.IDLE) return @@ -146,7 +146,7 @@ class Daemon: try: self.set_state(State.STT) logging.info("stt started") - text = self._transcribe(record.wav_path) + text = self._transcribe(audio) except Exception as exc: logging.error("stt failed: %s", exc) self.set_state(State.IDLE) @@ -199,8 +199,8 @@ class Daemon: self.state = State.STT threading.Thread(target=self._stop_and_process, daemon=True).start() - def _transcribe(self, wav_path: str) -> str: - segments, _info = self.model.transcribe(wav_path, language=STT_LANGUAGE, vad_filter=True) + def _transcribe(self, audio) -> str: + segments, _info = self.model.transcribe(audio, language=STT_LANGUAGE, vad_filter=True) parts = [] for seg in segments: text = (seg.text or "").strip() diff --git a/src/recorder.py b/src/recorder.py index 3fc80b1..4b2054c 100644 --- a/src/recorder.py +++ b/src/recorder.py @@ -1,21 +1,16 @@ -import tempfile from dataclasses import dataclass, field -from pathlib import Path from typing import Iterable import numpy as np import sounddevice as sd # type: ignore[import-not-found] -import soundfile as sf # type: ignore[import-not-found] @dataclass class RecordResult: - wav_path: str - temp_dir: str frames: list[np.ndarray] = field(default_factory=list) samplerate: int = 16000 channels: int = 1 - dtype: str = "int16" + dtype: str = "float32" def list_input_devices() -> list[dict]: @@ -54,9 +49,7 @@ def resolve_input_device(spec: str | int | None) -> int | None: def start_recording(input_spec: str | int | None) -> tuple[sd.InputStream, RecordResult]: - tmpdir = tempfile.mkdtemp(prefix="lel-") - wav = str(Path(tmpdir) / "mic.wav") - record = RecordResult(wav_path=wav, temp_dir=tmpdir) + record = RecordResult() device = resolve_input_device(input_spec) def callback(indata, _frames, _time, _status): @@ -73,20 +66,18 @@ def start_recording(input_spec: str | int | None) -> tuple[sd.InputStream, Recor return stream, record -def stop_recording(stream: sd.InputStream, record: RecordResult) -> None: +def stop_recording(stream: sd.InputStream, record: RecordResult) -> np.ndarray: if stream: stream.stop() stream.close() - _write_wav(record) - - -def _write_wav(record: RecordResult) -> None: - data = _flatten_frames(record.frames) - sf.write(record.wav_path, data, record.samplerate, subtype="PCM_16") + return _flatten_frames(record.frames) def _flatten_frames(frames: Iterable[np.ndarray]) -> np.ndarray: frames = list(frames) if not frames: - return np.zeros((0, 1), dtype=np.int16) - return np.concatenate(frames, axis=0) + return np.zeros((0,), dtype=np.float32) + data = np.concatenate(frames, axis=0) + if data.ndim > 1: + data = np.squeeze(data, axis=-1) + return np.asarray(data, dtype=np.float32).reshape(-1) diff --git a/uv.lock b/uv.lock index c35c48a..772833c 100644 --- a/uv.lock +++ b/uv.lock @@ -392,7 +392,6 @@ dependencies = [ { name = "pygobject" }, { name = "python-xlib" }, { name = "sounddevice" }, - { name = "soundfile" }, ] [package.metadata] @@ -402,7 +401,6 @@ requires-dist = [ { name = "pygobject" }, { name = "python-xlib" }, { name = "sounddevice" }, - { name = "soundfile" }, ] [[package]] @@ -880,26 +878,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4e/39/a61d4b83a7746b70d23d9173be688c0c6bfc7173772344b7442c2c155497/sounddevice-0.5.5-py3-none-win_arm64.whl", hash = "sha256:3861901ddd8230d2e0e8ae62ac320cdd4c688d81df89da036dcb812f757bb3e6", size = 317115, upload-time = "2026-01-23T18:36:42.235Z" }, ] -[[package]] -name = "soundfile" -version = "0.13.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cffi" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/64/28/e2a36573ccbcf3d57c00626a21fe51989380636e821b341d36ccca0c1c3a/soundfile-0.13.1-py2.py3-none-any.whl", hash = "sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445", size = 25751, upload-time = "2025-01-25T09:16:44.235Z" }, - { url = "https://files.pythonhosted.org/packages/ea/ab/73e97a5b3cc46bba7ff8650a1504348fa1863a6f9d57d7001c6b67c5f20e/soundfile-0.13.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:82dc664d19831933fe59adad199bf3945ad06d84bc111a5b4c0d3089a5b9ec33", size = 1142250, upload-time = "2025-01-25T09:16:47.583Z" }, - { url = "https://files.pythonhosted.org/packages/a0/e5/58fd1a8d7b26fc113af244f966ee3aecf03cb9293cb935daaddc1e455e18/soundfile-0.13.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:743f12c12c4054921e15736c6be09ac26b3b3d603aef6fd69f9dde68748f2593", size = 1101406, upload-time = "2025-01-25T09:16:49.662Z" }, - { url = "https://files.pythonhosted.org/packages/58/ae/c0e4a53d77cf6e9a04179535766b3321b0b9ced5f70522e4caf9329f0046/soundfile-0.13.1-py2.py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9c9e855f5a4d06ce4213f31918653ab7de0c5a8d8107cd2427e44b42df547deb", size = 1235729, upload-time = "2025-01-25T09:16:53.018Z" }, - { url = "https://files.pythonhosted.org/packages/57/5e/70bdd9579b35003a489fc850b5047beeda26328053ebadc1fb60f320f7db/soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:03267c4e493315294834a0870f31dbb3b28a95561b80b134f0bd3cf2d5f0e618", size = 1313646, upload-time = "2025-01-25T09:16:54.872Z" }, - { url = "https://files.pythonhosted.org/packages/fe/df/8c11dc4dfceda14e3003bb81a0d0edcaaf0796dd7b4f826ea3e532146bba/soundfile-0.13.1-py2.py3-none-win32.whl", hash = "sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5", size = 899881, upload-time = "2025-01-25T09:16:56.663Z" }, - { url = "https://files.pythonhosted.org/packages/14/e9/6b761de83277f2f02ded7e7ea6f07828ec78e4b229b80e4ca55dd205b9dc/soundfile-0.13.1-py2.py3-none-win_amd64.whl", hash = "sha256:1e70a05a0626524a69e9f0f4dd2ec174b4e9567f4d8b6c11d38b5c289be36ee9", size = 1019162, upload-time = "2025-01-25T09:16:59.573Z" }, -] - [[package]] name = "sympy" version = "1.14.0"