Use in-memory audio for STT

This commit is contained in:
Thales Maciel 2026-02-24 11:48:02 -03:00
parent 861f199dea
commit ebba452268
No known key found for this signature in database
GPG key ID: 33112E6833C34679
5 changed files with 17 additions and 50 deletions

View file

@ -6,12 +6,11 @@ Python X11 STT daemon that records audio, runs Whisper, logs the transcript, and
- X11 (not Wayland) - X11 (not Wayland)
- `sounddevice` (PortAudio) - `sounddevice` (PortAudio)
- `soundfile` (libsndfile)
- `faster-whisper` - `faster-whisper`
- Tray icon deps: `gtk3` - Tray icon deps: `gtk3`
- Python deps: `pillow`, `python-xlib`, `faster-whisper`, `PyGObject`, `sounddevice`, `soundfile` - Python deps: `pillow`, `python-xlib`, `faster-whisper`, `PyGObject`, `sounddevice`
System packages (example names): `portaudio`/`libportaudio2` and `libsndfile`. System packages (example names): `portaudio`/`libportaudio2`.
## Python Daemon ## Python Daemon

View file

@ -10,7 +10,6 @@ dependencies = [
"python-xlib", "python-xlib",
"PyGObject", "PyGObject",
"sounddevice", "sounddevice",
"soundfile",
] ]
[tool.uv] [tool.uv]

View file

@ -103,7 +103,7 @@ class Daemon:
self.proc = proc self.proc = proc
self.record = record self.record = record
self.state = State.RECORDING self.state = State.RECORDING
logging.info("recording started (%s)", record.wav_path) logging.info("recording started")
if self.timer: if self.timer:
self.timer.cancel() self.timer.cancel()
self.timer = threading.Timer(RECORD_TIMEOUT_SEC, self._timeout_stop) self.timer = threading.Timer(RECORD_TIMEOUT_SEC, self._timeout_stop)
@ -132,13 +132,13 @@ class Daemon:
logging.info("stopping recording (user)") logging.info("stopping recording (user)")
try: try:
stop_recording(proc, record) audio = stop_recording(proc, record)
except Exception as exc: except Exception as exc:
logging.error("record stop failed: %s", exc) logging.error("record stop failed: %s", exc)
self.set_state(State.IDLE) self.set_state(State.IDLE)
return return
if not Path(record.wav_path).exists(): if audio.size == 0:
logging.error("no audio captured") logging.error("no audio captured")
self.set_state(State.IDLE) self.set_state(State.IDLE)
return return
@ -146,7 +146,7 @@ class Daemon:
try: try:
self.set_state(State.STT) self.set_state(State.STT)
logging.info("stt started") logging.info("stt started")
text = self._transcribe(record.wav_path) text = self._transcribe(audio)
except Exception as exc: except Exception as exc:
logging.error("stt failed: %s", exc) logging.error("stt failed: %s", exc)
self.set_state(State.IDLE) self.set_state(State.IDLE)
@ -199,8 +199,8 @@ class Daemon:
self.state = State.STT self.state = State.STT
threading.Thread(target=self._stop_and_process, daemon=True).start() threading.Thread(target=self._stop_and_process, daemon=True).start()
def _transcribe(self, wav_path: str) -> str: def _transcribe(self, audio) -> str:
segments, _info = self.model.transcribe(wav_path, language=STT_LANGUAGE, vad_filter=True) segments, _info = self.model.transcribe(audio, language=STT_LANGUAGE, vad_filter=True)
parts = [] parts = []
for seg in segments: for seg in segments:
text = (seg.text or "").strip() text = (seg.text or "").strip()

View file

@ -1,21 +1,16 @@
import tempfile
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable from typing import Iterable
import numpy as np import numpy as np
import sounddevice as sd # type: ignore[import-not-found] import sounddevice as sd # type: ignore[import-not-found]
import soundfile as sf # type: ignore[import-not-found]
@dataclass @dataclass
class RecordResult: class RecordResult:
wav_path: str
temp_dir: str
frames: list[np.ndarray] = field(default_factory=list) frames: list[np.ndarray] = field(default_factory=list)
samplerate: int = 16000 samplerate: int = 16000
channels: int = 1 channels: int = 1
dtype: str = "int16" dtype: str = "float32"
def list_input_devices() -> list[dict]: def list_input_devices() -> list[dict]:
@ -54,9 +49,7 @@ def resolve_input_device(spec: str | int | None) -> int | None:
def start_recording(input_spec: str | int | None) -> tuple[sd.InputStream, RecordResult]: def start_recording(input_spec: str | int | None) -> tuple[sd.InputStream, RecordResult]:
tmpdir = tempfile.mkdtemp(prefix="lel-") record = RecordResult()
wav = str(Path(tmpdir) / "mic.wav")
record = RecordResult(wav_path=wav, temp_dir=tmpdir)
device = resolve_input_device(input_spec) device = resolve_input_device(input_spec)
def callback(indata, _frames, _time, _status): def callback(indata, _frames, _time, _status):
@ -73,20 +66,18 @@ def start_recording(input_spec: str | int | None) -> tuple[sd.InputStream, Recor
return stream, record return stream, record
def stop_recording(stream: sd.InputStream, record: RecordResult) -> None: def stop_recording(stream: sd.InputStream, record: RecordResult) -> np.ndarray:
if stream: if stream:
stream.stop() stream.stop()
stream.close() stream.close()
_write_wav(record) return _flatten_frames(record.frames)
def _write_wav(record: RecordResult) -> None:
data = _flatten_frames(record.frames)
sf.write(record.wav_path, data, record.samplerate, subtype="PCM_16")
def _flatten_frames(frames: Iterable[np.ndarray]) -> np.ndarray: def _flatten_frames(frames: Iterable[np.ndarray]) -> np.ndarray:
frames = list(frames) frames = list(frames)
if not frames: if not frames:
return np.zeros((0, 1), dtype=np.int16) return np.zeros((0,), dtype=np.float32)
return np.concatenate(frames, axis=0) data = np.concatenate(frames, axis=0)
if data.ndim > 1:
data = np.squeeze(data, axis=-1)
return np.asarray(data, dtype=np.float32).reshape(-1)

22
uv.lock generated
View file

@ -392,7 +392,6 @@ dependencies = [
{ name = "pygobject" }, { name = "pygobject" },
{ name = "python-xlib" }, { name = "python-xlib" },
{ name = "sounddevice" }, { name = "sounddevice" },
{ name = "soundfile" },
] ]
[package.metadata] [package.metadata]
@ -402,7 +401,6 @@ requires-dist = [
{ name = "pygobject" }, { name = "pygobject" },
{ name = "python-xlib" }, { name = "python-xlib" },
{ name = "sounddevice" }, { name = "sounddevice" },
{ name = "soundfile" },
] ]
[[package]] [[package]]
@ -880,26 +878,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/4e/39/a61d4b83a7746b70d23d9173be688c0c6bfc7173772344b7442c2c155497/sounddevice-0.5.5-py3-none-win_arm64.whl", hash = "sha256:3861901ddd8230d2e0e8ae62ac320cdd4c688d81df89da036dcb812f757bb3e6", size = 317115, upload-time = "2026-01-23T18:36:42.235Z" }, { url = "https://files.pythonhosted.org/packages/4e/39/a61d4b83a7746b70d23d9173be688c0c6bfc7173772344b7442c2c155497/sounddevice-0.5.5-py3-none-win_arm64.whl", hash = "sha256:3861901ddd8230d2e0e8ae62ac320cdd4c688d81df89da036dcb812f757bb3e6", size = 317115, upload-time = "2026-01-23T18:36:42.235Z" },
] ]
[[package]]
name = "soundfile"
version = "0.13.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "cffi" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/64/28/e2a36573ccbcf3d57c00626a21fe51989380636e821b341d36ccca0c1c3a/soundfile-0.13.1-py2.py3-none-any.whl", hash = "sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445", size = 25751, upload-time = "2025-01-25T09:16:44.235Z" },
{ url = "https://files.pythonhosted.org/packages/ea/ab/73e97a5b3cc46bba7ff8650a1504348fa1863a6f9d57d7001c6b67c5f20e/soundfile-0.13.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:82dc664d19831933fe59adad199bf3945ad06d84bc111a5b4c0d3089a5b9ec33", size = 1142250, upload-time = "2025-01-25T09:16:47.583Z" },
{ url = "https://files.pythonhosted.org/packages/a0/e5/58fd1a8d7b26fc113af244f966ee3aecf03cb9293cb935daaddc1e455e18/soundfile-0.13.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:743f12c12c4054921e15736c6be09ac26b3b3d603aef6fd69f9dde68748f2593", size = 1101406, upload-time = "2025-01-25T09:16:49.662Z" },
{ url = "https://files.pythonhosted.org/packages/58/ae/c0e4a53d77cf6e9a04179535766b3321b0b9ced5f70522e4caf9329f0046/soundfile-0.13.1-py2.py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9c9e855f5a4d06ce4213f31918653ab7de0c5a8d8107cd2427e44b42df547deb", size = 1235729, upload-time = "2025-01-25T09:16:53.018Z" },
{ url = "https://files.pythonhosted.org/packages/57/5e/70bdd9579b35003a489fc850b5047beeda26328053ebadc1fb60f320f7db/soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:03267c4e493315294834a0870f31dbb3b28a95561b80b134f0bd3cf2d5f0e618", size = 1313646, upload-time = "2025-01-25T09:16:54.872Z" },
{ url = "https://files.pythonhosted.org/packages/fe/df/8c11dc4dfceda14e3003bb81a0d0edcaaf0796dd7b4f826ea3e532146bba/soundfile-0.13.1-py2.py3-none-win32.whl", hash = "sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5", size = 899881, upload-time = "2025-01-25T09:16:56.663Z" },
{ url = "https://files.pythonhosted.org/packages/14/e9/6b761de83277f2f02ded7e7ea6f07828ec78e4b229b80e4ca55dd205b9dc/soundfile-0.13.1-py2.py3-none-win_amd64.whl", hash = "sha256:1e70a05a0626524a69e9f0f4dd2ec174b4e9567f4d8b6c11d38b5c289be36ee9", size = 1019162, upload-time = "2025-01-25T09:16:59.573Z" },
]
[[package]] [[package]]
name = "sympy" name = "sympy"
version = "1.14.0" version = "1.14.0"