Use in-memory audio for STT

This commit is contained in:
Thales Maciel 2026-02-24 11:48:02 -03:00
parent 861f199dea
commit ebba452268
No known key found for this signature in database
GPG key ID: 33112E6833C34679
5 changed files with 17 additions and 50 deletions

View file

@ -6,12 +6,11 @@ Python X11 STT daemon that records audio, runs Whisper, logs the transcript, and
- X11 (not Wayland)
- `sounddevice` (PortAudio)
- `soundfile` (libsndfile)
- `faster-whisper`
- Tray icon deps: `gtk3`
- Python deps: `pillow`, `python-xlib`, `faster-whisper`, `PyGObject`, `sounddevice`, `soundfile`
- Python deps: `pillow`, `python-xlib`, `faster-whisper`, `PyGObject`, `sounddevice`
System packages (example names): `portaudio`/`libportaudio2` and `libsndfile`.
System packages (example names): `portaudio`/`libportaudio2`.
## Python Daemon

View file

@ -10,7 +10,6 @@ dependencies = [
"python-xlib",
"PyGObject",
"sounddevice",
"soundfile",
]
[tool.uv]

View file

@ -103,7 +103,7 @@ class Daemon:
self.proc = proc
self.record = record
self.state = State.RECORDING
logging.info("recording started (%s)", record.wav_path)
logging.info("recording started")
if self.timer:
self.timer.cancel()
self.timer = threading.Timer(RECORD_TIMEOUT_SEC, self._timeout_stop)
@ -132,13 +132,13 @@ class Daemon:
logging.info("stopping recording (user)")
try:
stop_recording(proc, record)
audio = stop_recording(proc, record)
except Exception as exc:
logging.error("record stop failed: %s", exc)
self.set_state(State.IDLE)
return
if not Path(record.wav_path).exists():
if audio.size == 0:
logging.error("no audio captured")
self.set_state(State.IDLE)
return
@ -146,7 +146,7 @@ class Daemon:
try:
self.set_state(State.STT)
logging.info("stt started")
text = self._transcribe(record.wav_path)
text = self._transcribe(audio)
except Exception as exc:
logging.error("stt failed: %s", exc)
self.set_state(State.IDLE)
@ -199,8 +199,8 @@ class Daemon:
self.state = State.STT
threading.Thread(target=self._stop_and_process, daemon=True).start()
def _transcribe(self, wav_path: str) -> str:
segments, _info = self.model.transcribe(wav_path, language=STT_LANGUAGE, vad_filter=True)
def _transcribe(self, audio) -> str:
segments, _info = self.model.transcribe(audio, language=STT_LANGUAGE, vad_filter=True)
parts = []
for seg in segments:
text = (seg.text or "").strip()

View file

@ -1,21 +1,16 @@
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable
import numpy as np
import sounddevice as sd # type: ignore[import-not-found]
import soundfile as sf # type: ignore[import-not-found]
@dataclass
class RecordResult:
wav_path: str
temp_dir: str
frames: list[np.ndarray] = field(default_factory=list)
samplerate: int = 16000
channels: int = 1
dtype: str = "int16"
dtype: str = "float32"
def list_input_devices() -> list[dict]:
@ -54,9 +49,7 @@ def resolve_input_device(spec: str | int | None) -> int | None:
def start_recording(input_spec: str | int | None) -> tuple[sd.InputStream, RecordResult]:
tmpdir = tempfile.mkdtemp(prefix="lel-")
wav = str(Path(tmpdir) / "mic.wav")
record = RecordResult(wav_path=wav, temp_dir=tmpdir)
record = RecordResult()
device = resolve_input_device(input_spec)
def callback(indata, _frames, _time, _status):
@ -73,20 +66,18 @@ def start_recording(input_spec: str | int | None) -> tuple[sd.InputStream, Recor
return stream, record
def stop_recording(stream: sd.InputStream, record: RecordResult) -> None:
def stop_recording(stream: sd.InputStream, record: RecordResult) -> np.ndarray:
if stream:
stream.stop()
stream.close()
_write_wav(record)
def _write_wav(record: RecordResult) -> None:
data = _flatten_frames(record.frames)
sf.write(record.wav_path, data, record.samplerate, subtype="PCM_16")
return _flatten_frames(record.frames)
def _flatten_frames(frames: Iterable[np.ndarray]) -> np.ndarray:
frames = list(frames)
if not frames:
return np.zeros((0, 1), dtype=np.int16)
return np.concatenate(frames, axis=0)
return np.zeros((0,), dtype=np.float32)
data = np.concatenate(frames, axis=0)
if data.ndim > 1:
data = np.squeeze(data, axis=-1)
return np.asarray(data, dtype=np.float32).reshape(-1)

22
uv.lock generated
View file

@ -392,7 +392,6 @@ dependencies = [
{ name = "pygobject" },
{ name = "python-xlib" },
{ name = "sounddevice" },
{ name = "soundfile" },
]
[package.metadata]
@ -402,7 +401,6 @@ requires-dist = [
{ name = "pygobject" },
{ name = "python-xlib" },
{ name = "sounddevice" },
{ name = "soundfile" },
]
[[package]]
@ -880,26 +878,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/4e/39/a61d4b83a7746b70d23d9173be688c0c6bfc7173772344b7442c2c155497/sounddevice-0.5.5-py3-none-win_arm64.whl", hash = "sha256:3861901ddd8230d2e0e8ae62ac320cdd4c688d81df89da036dcb812f757bb3e6", size = 317115, upload-time = "2026-01-23T18:36:42.235Z" },
]
[[package]]
name = "soundfile"
version = "0.13.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "cffi" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/64/28/e2a36573ccbcf3d57c00626a21fe51989380636e821b341d36ccca0c1c3a/soundfile-0.13.1-py2.py3-none-any.whl", hash = "sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445", size = 25751, upload-time = "2025-01-25T09:16:44.235Z" },
{ url = "https://files.pythonhosted.org/packages/ea/ab/73e97a5b3cc46bba7ff8650a1504348fa1863a6f9d57d7001c6b67c5f20e/soundfile-0.13.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:82dc664d19831933fe59adad199bf3945ad06d84bc111a5b4c0d3089a5b9ec33", size = 1142250, upload-time = "2025-01-25T09:16:47.583Z" },
{ url = "https://files.pythonhosted.org/packages/a0/e5/58fd1a8d7b26fc113af244f966ee3aecf03cb9293cb935daaddc1e455e18/soundfile-0.13.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:743f12c12c4054921e15736c6be09ac26b3b3d603aef6fd69f9dde68748f2593", size = 1101406, upload-time = "2025-01-25T09:16:49.662Z" },
{ url = "https://files.pythonhosted.org/packages/58/ae/c0e4a53d77cf6e9a04179535766b3321b0b9ced5f70522e4caf9329f0046/soundfile-0.13.1-py2.py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9c9e855f5a4d06ce4213f31918653ab7de0c5a8d8107cd2427e44b42df547deb", size = 1235729, upload-time = "2025-01-25T09:16:53.018Z" },
{ url = "https://files.pythonhosted.org/packages/57/5e/70bdd9579b35003a489fc850b5047beeda26328053ebadc1fb60f320f7db/soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:03267c4e493315294834a0870f31dbb3b28a95561b80b134f0bd3cf2d5f0e618", size = 1313646, upload-time = "2025-01-25T09:16:54.872Z" },
{ url = "https://files.pythonhosted.org/packages/fe/df/8c11dc4dfceda14e3003bb81a0d0edcaaf0796dd7b4f826ea3e532146bba/soundfile-0.13.1-py2.py3-none-win32.whl", hash = "sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5", size = 899881, upload-time = "2025-01-25T09:16:56.663Z" },
{ url = "https://files.pythonhosted.org/packages/14/e9/6b761de83277f2f02ded7e7ea6f07828ec78e4b229b80e4ca55dd205b9dc/soundfile-0.13.1-py2.py3-none-win_amd64.whl", hash = "sha256:1e70a05a0626524a69e9f0f4dd2ec174b4e9567f4d8b6c11d38b5c289be36ee9", size = 1019162, upload-time = "2025-01-25T09:16:59.573Z" },
]
[[package]]
name = "sympy"
version = "1.14.0"