From ebba452268b80e692f8cd106e392ab874543b81b Mon Sep 17 00:00:00 2001
From: Thales Maciel <thales@thalesmaciel.com>
Date: Tue, 24 Feb 2026 11:48:02 -0300
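Subject: [PATCH] Use in-memory audio for STT

stop_recording() now returns the captured frames as a flat float32 NumPy
array, and the daemon passes that array directly to model.transcribe()
instead of writing a temporary WAV file and handing over its path.

RecordResult no longer carries wav_path/temp_dir, and its default dtype
changes from int16 to float32. An empty capture is now detected with
audio.size == 0 rather than by checking that the WAV file exists.

This removes the soundfile (libsndfile) dependency from the README,
pyproject.toml and uv.lock.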
---
 README.md       |  5 ++---
 pyproject.toml  |  1 -
 src/leld.py     | 12 ++++++------
 src/recorder.py | 27 +++++++++------------------
 uv.lock         | 22 ----------------------
 5 files changed, 17 insertions(+), 50 deletions(-)

diff --git a/README.md b/README.md
index 3a69e0f..69f5613 100644
--- a/README.md
+++ b/README.md
@@ -6,12 +6,11 @@ Python X11 STT daemon that records audio, runs Whisper, logs the transcript, and
 
 - X11 (not Wayland)
 - `sounddevice` (PortAudio)
-- `soundfile` (libsndfile)
 - `faster-whisper`
 - Tray icon deps: `gtk3`
-- Python deps: `pillow`, `python-xlib`, `faster-whisper`, `PyGObject`, `sounddevice`, `soundfile`
+- Python deps: `pillow`, `python-xlib`, `faster-whisper`, `PyGObject`, `sounddevice`
 
-System packages (example names): `portaudio`/`libportaudio2` and `libsndfile`.
+System packages (example names): `portaudio`/`libportaudio2`.
 
 ## Python Daemon
 
diff --git a/pyproject.toml b/pyproject.toml
index 546c6f0..90af824 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,6 @@ dependencies = [
   "python-xlib",
   "PyGObject",
   "sounddevice",
-  "soundfile",
 ]
 
 [tool.uv]
diff --git a/src/leld.py b/src/leld.py
index 8d25430..d08ae71 100755
--- a/src/leld.py
+++ b/src/leld.py
@@ -103,7 +103,7 @@ class Daemon:
         self.proc = proc
         self.record = record
         self.state = State.RECORDING
-        logging.info("recording started (%s)", record.wav_path)
+        logging.info("recording started")
         if self.timer:
             self.timer.cancel()
         self.timer = threading.Timer(RECORD_TIMEOUT_SEC, self._timeout_stop)
@@ -132,13 +132,13 @@ class Daemon:
 
         logging.info("stopping recording (user)")
         try:
-            stop_recording(proc, record)
+            audio = stop_recording(proc, record)
         except Exception as exc:
             logging.error("record stop failed: %s", exc)
             self.set_state(State.IDLE)
             return
 
-        if not Path(record.wav_path).exists():
+        if audio.size == 0:
             logging.error("no audio captured")
             self.set_state(State.IDLE)
             return
@@ -146,7 +146,7 @@ class Daemon:
         try:
             self.set_state(State.STT)
             logging.info("stt started")
-            text = self._transcribe(record.wav_path)
+            text = self._transcribe(audio)
         except Exception as exc:
             logging.error("stt failed: %s", exc)
             self.set_state(State.IDLE)
@@ -199,8 +199,8 @@ class Daemon:
             self.state = State.STT
         threading.Thread(target=self._stop_and_process, daemon=True).start()
 
-    def _transcribe(self, wav_path: str) -> str:
-        segments, _info = self.model.transcribe(wav_path, language=STT_LANGUAGE, vad_filter=True)
+    def _transcribe(self, audio) -> str:
+        segments, _info = self.model.transcribe(audio, language=STT_LANGUAGE, vad_filter=True)
         parts = []
         for seg in segments:
             text = (seg.text or "").strip()
diff --git a/src/recorder.py b/src/recorder.py
index 3fc80b1..4b2054c 100644
--- a/src/recorder.py
+++ b/src/recorder.py
@@ -1,21 +1,16 @@
-import tempfile
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import Iterable
 
 import numpy as np
 import sounddevice as sd  # type: ignore[import-not-found]
-import soundfile as sf  # type: ignore[import-not-found]
 
 
 @dataclass
 class RecordResult:
-    wav_path: str
-    temp_dir: str
     frames: list[np.ndarray] = field(default_factory=list)
     samplerate: int = 16000
     channels: int = 1
-    dtype: str = "int16"
+    dtype: str = "float32"
 
 
 def list_input_devices() -> list[dict]:
@@ -54,9 +49,7 @@ def resolve_input_device(spec: str | int | None) -> int | None:
 
 
 def start_recording(input_spec: str | int | None) -> tuple[sd.InputStream, RecordResult]:
-    tmpdir = tempfile.mkdtemp(prefix="lel-")
-    wav = str(Path(tmpdir) / "mic.wav")
-    record = RecordResult(wav_path=wav, temp_dir=tmpdir)
+    record = RecordResult()
     device = resolve_input_device(input_spec)
 
     def callback(indata, _frames, _time, _status):
@@ -73,20 +66,18 @@ def start_recording(input_spec: str | int | None) -> tuple[sd.InputStream, Recor
     return stream, record
 
 
-def stop_recording(stream: sd.InputStream, record: RecordResult) -> None:
+def stop_recording(stream: sd.InputStream, record: RecordResult) -> np.ndarray:
     if stream:
         stream.stop()
         stream.close()
-    _write_wav(record)
-
-
-def _write_wav(record: RecordResult) -> None:
-    data = _flatten_frames(record.frames)
-    sf.write(record.wav_path, data, record.samplerate, subtype="PCM_16")
+    return _flatten_frames(record.frames)
 
 
 def _flatten_frames(frames: Iterable[np.ndarray]) -> np.ndarray:
     frames = list(frames)
     if not frames:
-        return np.zeros((0, 1), dtype=np.int16)
-    return np.concatenate(frames, axis=0)
+        return np.zeros((0,), dtype=np.float32)
+    data = np.concatenate(frames, axis=0)
+    if data.ndim > 1:
+        data = np.squeeze(data, axis=-1)
+    return np.asarray(data, dtype=np.float32).reshape(-1)
diff --git a/uv.lock b/uv.lock
index c35c48a..772833c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -392,7 +392,6 @@ dependencies = [
     { name = "pygobject" },
     { name = "python-xlib" },
     { name = "sounddevice" },
-    { name = "soundfile" },
 ]
 
 [package.metadata]
@@ -402,7 +401,6 @@ requires-dist = [
     { name = "pygobject" },
     { name = "python-xlib" },
     { name = "sounddevice" },
-    { name = "soundfile" },
 ]
 
 [[package]]
@@ -880,26 +878,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/4e/39/a61d4b83a7746b70d23d9173be688c0c6bfc7173772344b7442c2c155497/sounddevice-0.5.5-py3-none-win_arm64.whl", hash = "sha256:3861901ddd8230d2e0e8ae62ac320cdd4c688d81df89da036dcb812f757bb3e6", size = 317115, upload-time = "2026-01-23T18:36:42.235Z" },
 ]
 
-[[package]]
-name = "soundfile"
-version = "0.13.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "cffi" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
-    { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/64/28/e2a36573ccbcf3d57c00626a21fe51989380636e821b341d36ccca0c1c3a/soundfile-0.13.1-py2.py3-none-any.whl", hash = "sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445", size = 25751, upload-time = "2025-01-25T09:16:44.235Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/ab/73e97a5b3cc46bba7ff8650a1504348fa1863a6f9d57d7001c6b67c5f20e/soundfile-0.13.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:82dc664d19831933fe59adad199bf3945ad06d84bc111a5b4c0d3089a5b9ec33", size = 1142250, upload-time = "2025-01-25T09:16:47.583Z" },
-    { url = "https://files.pythonhosted.org/packages/a0/e5/58fd1a8d7b26fc113af244f966ee3aecf03cb9293cb935daaddc1e455e18/soundfile-0.13.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:743f12c12c4054921e15736c6be09ac26b3b3d603aef6fd69f9dde68748f2593", size = 1101406, upload-time = "2025-01-25T09:16:49.662Z" },
-    { url = "https://files.pythonhosted.org/packages/58/ae/c0e4a53d77cf6e9a04179535766b3321b0b9ced5f70522e4caf9329f0046/soundfile-0.13.1-py2.py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9c9e855f5a4d06ce4213f31918653ab7de0c5a8d8107cd2427e44b42df547deb", size = 1235729, upload-time = "2025-01-25T09:16:53.018Z" },
-    { url = "https://files.pythonhosted.org/packages/57/5e/70bdd9579b35003a489fc850b5047beeda26328053ebadc1fb60f320f7db/soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:03267c4e493315294834a0870f31dbb3b28a95561b80b134f0bd3cf2d5f0e618", size = 1313646, upload-time = "2025-01-25T09:16:54.872Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/df/8c11dc4dfceda14e3003bb81a0d0edcaaf0796dd7b4f826ea3e532146bba/soundfile-0.13.1-py2.py3-none-win32.whl", hash = "sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5", size = 899881, upload-time = "2025-01-25T09:16:56.663Z" },
-    { url = "https://files.pythonhosted.org/packages/14/e9/6b761de83277f2f02ded7e7ea6f07828ec78e4b229b80e4ca55dd205b9dc/soundfile-0.13.1-py2.py3-none-win_amd64.whl", hash = "sha256:1e70a05a0626524a69e9f0f4dd2ec174b4e9567f4d8b6c11d38b5c289be36ee9", size = 1019162, upload-time = "2025-01-25T09:16:59.573Z" },
-]
-
 [[package]]
 name = "sympy"
 version = "1.14.0"