Use in-memory audio for STT

2026-02-24 11:48:02 -03:00 · 2026-02-24 11:48:02 -03:00 · ebba452268
commit ebba452268
parent 861f199dea
5 changed files with 17 additions and 50 deletions
--- a/README.md
+++ b/README.md
@@ -6,12 +6,11 @@ Python X11 STT daemon that records audio, runs Whisper, logs the transcript, and

 - X11 (not Wayland)
 - `sounddevice` (PortAudio)
- `soundfile` (libsndfile)
 - `faster-whisper`
 - Tray icon deps: `gtk3`
- Python deps: `pillow`, `python-xlib`, `faster-whisper`, `PyGObject`, `sounddevice`, `soundfile`
+- Python deps: `pillow`, `python-xlib`, `faster-whisper`, `PyGObject`, `sounddevice`

-System packages (example names): `portaudio`/`libportaudio2` and `libsndfile`.
+System packages (example names): `portaudio`/`libportaudio2`.

 ## Python Daemon

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,6 @@ dependencies = [
  "python-xlib",
  "PyGObject",
  "sounddevice",
-  "soundfile",
 ]

 [tool.uv]
--- a/src/leld.py
+++ b/src/leld.py
@@ -103,7 +103,7 @@ class Daemon:
        self.proc = proc
        self.record = record
        self.state = State.RECORDING
-        logging.info("recording started (%s)", record.wav_path)
+        logging.info("recording started")
        if self.timer:
            self.timer.cancel()
        self.timer = threading.Timer(RECORD_TIMEOUT_SEC, self._timeout_stop)
@@ -132,13 +132,13 @@ class Daemon:

        logging.info("stopping recording (user)")
        try:
-            stop_recording(proc, record)
+            audio = stop_recording(proc, record)
        except Exception as exc:
            logging.error("record stop failed: %s", exc)
            self.set_state(State.IDLE)
            return

-        if not Path(record.wav_path).exists():
+        if audio.size == 0:
            logging.error("no audio captured")
            self.set_state(State.IDLE)
            return
@@ -146,7 +146,7 @@ class Daemon:
        try:
            self.set_state(State.STT)
            logging.info("stt started")
-            text = self._transcribe(record.wav_path)
+            text = self._transcribe(audio)
        except Exception as exc:
            logging.error("stt failed: %s", exc)
            self.set_state(State.IDLE)
@@ -199,8 +199,8 @@ class Daemon:
            self.state = State.STT
        threading.Thread(target=self._stop_and_process, daemon=True).start()

-    def _transcribe(self, wav_path: str) -> str:
-        segments, _info = self.model.transcribe(wav_path, language=STT_LANGUAGE, vad_filter=True)
+    def _transcribe(self, audio) -> str:
+        segments, _info = self.model.transcribe(audio, language=STT_LANGUAGE, vad_filter=True)
        parts = []
        for seg in segments:
            text = (seg.text or "").strip()
--- a/src/recorder.py
+++ b/src/recorder.py
@@ -1,21 +1,16 @@
-import tempfile
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import Iterable

 import numpy as np
 import sounddevice as sd  # type: ignore[import-not-found]
-import soundfile as sf  # type: ignore[import-not-found]


@dataclass
 class RecordResult:
-    wav_path: str
-    temp_dir: str
    frames: list[np.ndarray] = field(default_factory=list)
    samplerate: int = 16000
    channels: int = 1
-    dtype: str = "int16"
+    dtype: str = "float32"


 def list_input_devices() -> list[dict]:
@@ -54,9 +49,7 @@ def resolve_input_device(spec: str | int | None) -> int | None:


 def start_recording(input_spec: str | int | None) -> tuple[sd.InputStream, RecordResult]:
-    tmpdir = tempfile.mkdtemp(prefix="lel-")
-    wav = str(Path(tmpdir) / "mic.wav")
-    record = RecordResult(wav_path=wav, temp_dir=tmpdir)
+    record = RecordResult()
    device = resolve_input_device(input_spec)

    def callback(indata, _frames, _time, _status):
@@ -73,20 +66,18 @@ def start_recording(input_spec: str | int | None) -> tuple[sd.InputStream, Recor
    return stream, record


-def stop_recording(stream: sd.InputStream, record: RecordResult) -> None:
+def stop_recording(stream: sd.InputStream, record: RecordResult) -> np.ndarray:
    if stream:
        stream.stop()
        stream.close()
-    _write_wav(record)
-
-
-def _write_wav(record: RecordResult) -> None:
-    data = _flatten_frames(record.frames)
-    sf.write(record.wav_path, data, record.samplerate, subtype="PCM_16")
+    return _flatten_frames(record.frames)


 def _flatten_frames(frames: Iterable[np.ndarray]) -> np.ndarray:
    frames = list(frames)
    if not frames:
-        return np.zeros((0, 1), dtype=np.int16)
-    return np.concatenate(frames, axis=0)
+        return np.zeros((0,), dtype=np.float32)
+    data = np.concatenate(frames, axis=0)
+    if data.ndim > 1:
+        data = np.squeeze(data, axis=-1)
+    return np.asarray(data, dtype=np.float32).reshape(-1)
--- a/uv.lock
+++ b/uv.lock
@@ -392,7 +392,6 @@ dependencies = [
    { name = "pygobject" },
    { name = "python-xlib" },
    { name = "sounddevice" },
-    { name = "soundfile" },
 ]

 [package.metadata]
@@ -402,7 +401,6 @@ requires-dist = [
    { name = "pygobject" },
    { name = "python-xlib" },
    { name = "sounddevice" },
-    { name = "soundfile" },
 ]

 [[package]]
@@ -880,26 +878,6 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/4e/39/a61d4b83a7746b70d23d9173be688c0c6bfc7173772344b7442c2c155497/sounddevice-0.5.5-py3-none-win_arm64.whl", hash = "sha256:3861901ddd8230d2e0e8ae62ac320cdd4c688d81df89da036dcb812f757bb3e6", size = 317115, upload-time = "2026-01-23T18:36:42.235Z" },
 ]

-[[package]]
-name = "soundfile"
-version = "0.13.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "cffi" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
-    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/64/28/e2a36573ccbcf3d57c00626a21fe51989380636e821b341d36ccca0c1c3a/soundfile-0.13.1-py2.py3-none-any.whl", hash = "sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445", size = 25751, upload-time = "2025-01-25T09:16:44.235Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/ab/73e97a5b3cc46bba7ff8650a1504348fa1863a6f9d57d7001c6b67c5f20e/soundfile-0.13.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:82dc664d19831933fe59adad199bf3945ad06d84bc111a5b4c0d3089a5b9ec33", size = 1142250, upload-time = "2025-01-25T09:16:47.583Z" },
-    { url = "https://files.pythonhosted.org/packages/a0/e5/58fd1a8d7b26fc113af244f966ee3aecf03cb9293cb935daaddc1e455e18/soundfile-0.13.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:743f12c12c4054921e15736c6be09ac26b3b3d603aef6fd69f9dde68748f2593", size = 1101406, upload-time = "2025-01-25T09:16:49.662Z" },
-    { url = "https://files.pythonhosted.org/packages/58/ae/c0e4a53d77cf6e9a04179535766b3321b0b9ced5f70522e4caf9329f0046/soundfile-0.13.1-py2.py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9c9e855f5a4d06ce4213f31918653ab7de0c5a8d8107cd2427e44b42df547deb", size = 1235729, upload-time = "2025-01-25T09:16:53.018Z" },
-    { url = "https://files.pythonhosted.org/packages/57/5e/70bdd9579b35003a489fc850b5047beeda26328053ebadc1fb60f320f7db/soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:03267c4e493315294834a0870f31dbb3b28a95561b80b134f0bd3cf2d5f0e618", size = 1313646, upload-time = "2025-01-25T09:16:54.872Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/df/8c11dc4dfceda14e3003bb81a0d0edcaaf0796dd7b4f826ea3e532146bba/soundfile-0.13.1-py2.py3-none-win32.whl", hash = "sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5", size = 899881, upload-time = "2025-01-25T09:16:56.663Z" },
-    { url = "https://files.pythonhosted.org/packages/14/e9/6b761de83277f2f02ded7e7ea6f07828ec78e4b229b80e4ca55dd205b9dc/soundfile-0.13.1-py2.py3-none-win_amd64.whl", hash = "sha256:1e70a05a0626524a69e9f0f4dd2ec174b4e9567f4d8b6c11d38b5c289be36ee9", size = 1019162, upload-time = "2025-01-25T09:16:59.573Z" },
-]
-
 [[package]]
 name = "sympy"
 version = "1.14.0"