Use in-memory audio for STT
This commit is contained in:
parent
861f199dea
commit
ebba452268
5 changed files with 17 additions and 50 deletions
|
|
@ -6,12 +6,11 @@ Python X11 STT daemon that records audio, runs Whisper, logs the transcript, and
|
|||
|
||||
- X11 (not Wayland)
|
||||
- `sounddevice` (PortAudio)
|
||||
- `soundfile` (libsndfile)
|
||||
- `faster-whisper`
|
||||
- Tray icon deps: `gtk3`
|
||||
- Python deps: `pillow`, `python-xlib`, `faster-whisper`, `PyGObject`, `sounddevice`, `soundfile`
|
||||
- Python deps: `pillow`, `python-xlib`, `faster-whisper`, `PyGObject`, `sounddevice`
|
||||
|
||||
System packages (example names): `portaudio`/`libportaudio2` and `libsndfile`.
|
||||
System packages (example names): `portaudio`/`libportaudio2`.
|
||||
|
||||
## Python Daemon
|
||||
|
||||
|
|
|
|||
|
|
@ -10,7 +10,6 @@ dependencies = [
|
|||
"python-xlib",
|
||||
"PyGObject",
|
||||
"sounddevice",
|
||||
"soundfile",
|
||||
]
|
||||
|
||||
[tool.uv]
|
||||
|
|
|
|||
12
src/leld.py
12
src/leld.py
|
|
@ -103,7 +103,7 @@ class Daemon:
|
|||
self.proc = proc
|
||||
self.record = record
|
||||
self.state = State.RECORDING
|
||||
logging.info("recording started (%s)", record.wav_path)
|
||||
logging.info("recording started")
|
||||
if self.timer:
|
||||
self.timer.cancel()
|
||||
self.timer = threading.Timer(RECORD_TIMEOUT_SEC, self._timeout_stop)
|
||||
|
|
@ -132,13 +132,13 @@ class Daemon:
|
|||
|
||||
logging.info("stopping recording (user)")
|
||||
try:
|
||||
stop_recording(proc, record)
|
||||
audio = stop_recording(proc, record)
|
||||
except Exception as exc:
|
||||
logging.error("record stop failed: %s", exc)
|
||||
self.set_state(State.IDLE)
|
||||
return
|
||||
|
||||
if not Path(record.wav_path).exists():
|
||||
if audio.size == 0:
|
||||
logging.error("no audio captured")
|
||||
self.set_state(State.IDLE)
|
||||
return
|
||||
|
|
@ -146,7 +146,7 @@ class Daemon:
|
|||
try:
|
||||
self.set_state(State.STT)
|
||||
logging.info("stt started")
|
||||
text = self._transcribe(record.wav_path)
|
||||
text = self._transcribe(audio)
|
||||
except Exception as exc:
|
||||
logging.error("stt failed: %s", exc)
|
||||
self.set_state(State.IDLE)
|
||||
|
|
@ -199,8 +199,8 @@ class Daemon:
|
|||
self.state = State.STT
|
||||
threading.Thread(target=self._stop_and_process, daemon=True).start()
|
||||
|
||||
def _transcribe(self, wav_path: str) -> str:
|
||||
segments, _info = self.model.transcribe(wav_path, language=STT_LANGUAGE, vad_filter=True)
|
||||
def _transcribe(self, audio) -> str:
|
||||
segments, _info = self.model.transcribe(audio, language=STT_LANGUAGE, vad_filter=True)
|
||||
parts = []
|
||||
for seg in segments:
|
||||
text = (seg.text or "").strip()
|
||||
|
|
|
|||
|
|
@ -1,21 +1,16 @@
|
|||
import tempfile
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
import numpy as np
|
||||
import sounddevice as sd # type: ignore[import-not-found]
|
||||
import soundfile as sf # type: ignore[import-not-found]
|
||||
|
||||
|
||||
@dataclass
|
||||
class RecordResult:
|
||||
wav_path: str
|
||||
temp_dir: str
|
||||
frames: list[np.ndarray] = field(default_factory=list)
|
||||
samplerate: int = 16000
|
||||
channels: int = 1
|
||||
dtype: str = "int16"
|
||||
dtype: str = "float32"
|
||||
|
||||
|
||||
def list_input_devices() -> list[dict]:
|
||||
|
|
@ -54,9 +49,7 @@ def resolve_input_device(spec: str | int | None) -> int | None:
|
|||
|
||||
|
||||
def start_recording(input_spec: str | int | None) -> tuple[sd.InputStream, RecordResult]:
|
||||
tmpdir = tempfile.mkdtemp(prefix="lel-")
|
||||
wav = str(Path(tmpdir) / "mic.wav")
|
||||
record = RecordResult(wav_path=wav, temp_dir=tmpdir)
|
||||
record = RecordResult()
|
||||
device = resolve_input_device(input_spec)
|
||||
|
||||
def callback(indata, _frames, _time, _status):
|
||||
|
|
@ -73,20 +66,18 @@ def start_recording(input_spec: str | int | None) -> tuple[sd.InputStream, Recor
|
|||
return stream, record
|
||||
|
||||
|
||||
def stop_recording(stream: sd.InputStream, record: RecordResult) -> None:
|
||||
def stop_recording(stream: sd.InputStream, record: RecordResult) -> np.ndarray:
|
||||
if stream:
|
||||
stream.stop()
|
||||
stream.close()
|
||||
_write_wav(record)
|
||||
|
||||
|
||||
def _write_wav(record: RecordResult) -> None:
|
||||
data = _flatten_frames(record.frames)
|
||||
sf.write(record.wav_path, data, record.samplerate, subtype="PCM_16")
|
||||
return _flatten_frames(record.frames)
|
||||
|
||||
|
||||
def _flatten_frames(frames: Iterable[np.ndarray]) -> np.ndarray:
|
||||
frames = list(frames)
|
||||
if not frames:
|
||||
return np.zeros((0, 1), dtype=np.int16)
|
||||
return np.concatenate(frames, axis=0)
|
||||
return np.zeros((0,), dtype=np.float32)
|
||||
data = np.concatenate(frames, axis=0)
|
||||
if data.ndim > 1:
|
||||
data = np.squeeze(data, axis=-1)
|
||||
return np.asarray(data, dtype=np.float32).reshape(-1)
|
||||
|
|
|
|||
22
uv.lock
generated
22
uv.lock
generated
|
|
@ -392,7 +392,6 @@ dependencies = [
|
|||
{ name = "pygobject" },
|
||||
{ name = "python-xlib" },
|
||||
{ name = "sounddevice" },
|
||||
{ name = "soundfile" },
|
||||
]
|
||||
|
||||
[package.metadata]
|
||||
|
|
@ -402,7 +401,6 @@ requires-dist = [
|
|||
{ name = "pygobject" },
|
||||
{ name = "python-xlib" },
|
||||
{ name = "sounddevice" },
|
||||
{ name = "soundfile" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -880,26 +878,6 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/4e/39/a61d4b83a7746b70d23d9173be688c0c6bfc7173772344b7442c2c155497/sounddevice-0.5.5-py3-none-win_arm64.whl", hash = "sha256:3861901ddd8230d2e0e8ae62ac320cdd4c688d81df89da036dcb812f757bb3e6", size = 317115, upload-time = "2026-01-23T18:36:42.235Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "soundfile"
|
||||
version = "0.13.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "cffi" },
|
||||
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
|
||||
{ name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/64/28/e2a36573ccbcf3d57c00626a21fe51989380636e821b341d36ccca0c1c3a/soundfile-0.13.1-py2.py3-none-any.whl", hash = "sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445", size = 25751, upload-time = "2025-01-25T09:16:44.235Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ea/ab/73e97a5b3cc46bba7ff8650a1504348fa1863a6f9d57d7001c6b67c5f20e/soundfile-0.13.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:82dc664d19831933fe59adad199bf3945ad06d84bc111a5b4c0d3089a5b9ec33", size = 1142250, upload-time = "2025-01-25T09:16:47.583Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a0/e5/58fd1a8d7b26fc113af244f966ee3aecf03cb9293cb935daaddc1e455e18/soundfile-0.13.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:743f12c12c4054921e15736c6be09ac26b3b3d603aef6fd69f9dde68748f2593", size = 1101406, upload-time = "2025-01-25T09:16:49.662Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/58/ae/c0e4a53d77cf6e9a04179535766b3321b0b9ced5f70522e4caf9329f0046/soundfile-0.13.1-py2.py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9c9e855f5a4d06ce4213f31918653ab7de0c5a8d8107cd2427e44b42df547deb", size = 1235729, upload-time = "2025-01-25T09:16:53.018Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/57/5e/70bdd9579b35003a489fc850b5047beeda26328053ebadc1fb60f320f7db/soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:03267c4e493315294834a0870f31dbb3b28a95561b80b134f0bd3cf2d5f0e618", size = 1313646, upload-time = "2025-01-25T09:16:54.872Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/fe/df/8c11dc4dfceda14e3003bb81a0d0edcaaf0796dd7b4f826ea3e532146bba/soundfile-0.13.1-py2.py3-none-win32.whl", hash = "sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5", size = 899881, upload-time = "2025-01-25T09:16:56.663Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/14/e9/6b761de83277f2f02ded7e7ea6f07828ec78e4b229b80e4ca55dd205b9dc/soundfile-0.13.1-py2.py3-none-win_amd64.whl", hash = "sha256:1e70a05a0626524a69e9f0f4dd2ec174b4e9567f4d8b6c11d38b5c289be36ee9", size = 1019162, upload-time = "2025-01-25T09:16:59.573Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sympy"
|
||||
version = "1.14.0"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue